# HG changeset patch # User m-zytnicki # Date 1367503007 14400 # Node ID 2c0c0a89fad78e35e57f9ed69f4bf24a3330ba53 # Parent d22fadc825e34b42191c42cd7409dbfafea87747 Uploaded diff -r d22fadc825e3 -r 2c0c0a89fad7 LICENSE.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE.txt Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,506 @@ + +CeCILL FREE SOFTWARE LICENSE AGREEMENT + + + Notice + +This Agreement is a Free Software license agreement that is the result +of discussions between its authors in order to ensure compliance with +the two main principles guiding its drafting: + + * firstly, compliance with the principles governing the distribution + of Free Software: access to source code, broad rights granted to + users, + * secondly, the election of a governing law, French law, with which + it is conformant, both as regards the law of torts and + intellectual property law, and the protection that it offers to + both authors and holders of the economic rights over software. + +The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) +license are: + +Commissariat à l'Energie Atomique - CEA, a public scientific, technical +and industrial research establishment, having its principal place of +business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. + +Centre National de la Recherche Scientifique - CNRS, a public scientific +and technological establishment, having its principal place of business +at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, a public scientific and technological establishment, having its +principal place of business at Domaine de Voluceau, Rocquencourt, BP +105, 78153 Le Chesnay cedex, France. + + + Preamble + +The purpose of this Free Software license agreement is to grant users +the right to modify and redistribute the software governed by this +license within the framework of an open source distribution model. 
+ +The exercising of these rights is conditional upon certain obligations +for users so as to preserve this status for all subsequent redistributions. + +In consideration of access to the source code and the rights to copy, +modify and redistribute granted by the license, users are provided only +with a limited warranty and the software's author, the holder of the +economic rights, and the successive licensors only have limited liability. + +In this respect, the risks associated with loading, using, modifying +and/or developing or reproducing the software by the user are brought to +the user's attention, given its Free Software status, which may make it +complicated to use, with the result that its use is reserved for +developers and experienced professionals having in-depth computer +knowledge. Users are therefore encouraged to load and test the +suitability of the software as regards their requirements in conditions +enabling the security of their systems and/or data to be ensured and, +more generally, to use and operate it in the same conditions of +security. This Agreement may be freely reproduced and published, +provided it is not altered, and that no provisions are either added or +removed herefrom. + +This Agreement may apply to any or all software for which the holder of +the economic rights decides to submit the use thereof to its provisions. + + + Article 1 - DEFINITIONS + +For the purpose of this Agreement, when the following expressions +commence with a capital letter, they shall have the following meaning: + +Agreement: means this license agreement, and its possible subsequent +versions and annexes. + +Software: means the software in its Object Code and/or Source Code form +and, where applicable, its documentation, "as is" when the Licensee +accepts the Agreement. 
+ +Initial Software: means the Software in its Source Code and possibly its +Object Code form and, where applicable, its documentation, "as is" when +it is first distributed under the terms and conditions of the Agreement. + +Modified Software: means the Software modified by at least one +Contribution. + +Source Code: means all the Software's instructions and program lines to +which access is required so as to modify the Software. + +Object Code: means the binary files originating from the compilation of +the Source Code. + +Holder: means the holder(s) of the economic rights over the Initial +Software. + +Licensee: means the Software user(s) having accepted the Agreement. + +Contributor: means a Licensee having made at least one Contribution. + +Licensor: means the Holder, or any other individual or legal entity, who +distributes the Software under the Agreement. + +Contribution: means any or all modifications, corrections, translations, +adaptations and/or new functions integrated into the Software by any or +all Contributors, as well as any or all Internal Modules. + +Module: means a set of sources files including their documentation that +enables supplementary functions or services in addition to those offered +by the Software. + +External Module: means any or all Modules, not derived from the +Software, so that this Module and the Software run in separate address +spaces, with one calling the other when they are run. + +Internal Module: means any or all Module, connected to the Software so +that they both execute in the same address space. + +GNU GPL: means the GNU General Public License version 2 or any +subsequent version, as published by the Free Software Foundation Inc. + +Parties: mean both the Licensee and the Licensor. + +These expressions may be used both in singular and plural form. 
+ + + Article 2 - PURPOSE + +The purpose of the Agreement is the grant by the Licensor to the +Licensee of a non-exclusive, transferable and worldwide license for the +Software as set forth in Article 5 hereinafter for the whole term of the +protection granted by the rights over said Software. + + + Article 3 - ACCEPTANCE + +3.1 The Licensee shall be deemed as having accepted the terms and +conditions of this Agreement upon the occurrence of the first of the +following events: + + * (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; + * (ii) the first time the Licensee exercises any of the rights + granted hereunder. + +3.2 One copy of the Agreement, containing a notice relating to the +characteristics of the Software, to the limited warranty, and to the +fact that its use is restricted to experienced users has been provided +to the Licensee prior to its acceptance as set forth in Article 3.1 +hereinabove, and the Licensee hereby acknowledges that it has read and +understood it. + + + Article 4 - EFFECTIVE DATE AND TERM + + + 4.1 EFFECTIVE DATE + +The Agreement shall become effective on the date when it is accepted by +the Licensee as set forth in Article 3.1. + + + 4.2 TERM + +The Agreement shall remain in force for the entire legal term of +protection of the economic rights over the Software. + + + Article 5 - SCOPE OF RIGHTS GRANTED + +The Licensor hereby grants to the Licensee, who accepts, the following +rights over the Software for any or all use, and for the term of the +Agreement, on the basis of the terms and conditions set forth hereinafter. + +Besides, if the Licensor owns or comes to own one or more patents +protecting all or part of the functions of the Software or of its +components, the Licensor undertakes not to enforce the rights granted by +these patents against successive Licensees using, exploiting or +modifying the Software. 
If these patents are transferred, the Licensor +undertakes to have the transferees subscribe to the obligations set +forth in this paragraph. + + + 5.1 RIGHT OF USE + +The Licensee is authorized to use the Software, without any limitation +as to its fields of application, with it being hereinafter specified +that this comprises: + + 1. permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + + 2. loading, displaying, running, or storing the Software on any or + all medium. + + 3. entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. + + + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + +The right to make Contributions includes the right to translate, adapt, +arrange, or make any or all modifications to the Software, and the right +to reproduce the resulting software. + +The Licensee is authorized to make any or all Contributions to the +Software provided that it includes an explicit notice that it is the +author of said Contribution and indicates the date of the creation thereof. + + + 5.3 RIGHT OF DISTRIBUTION + +In particular, the right of distribution includes the right to publish, +transmit and communicate the Software to the general public on any or +all medium, and by any or all means, and the right to market, either in +consideration of a fee, or free of charge, one or more copies of the +Software by any means. + +The Licensee is further authorized to distribute copies of the modified +or unmodified Software to third parties according to the terms and +conditions set forth hereinafter. 
+ + + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + +The Licensee is authorized to distribute true copies of the Software in +Source Code or Object Code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the Object Code of the Software is +redistributed, the Licensee allows future Licensees unhindered access to +the full Source Code of the Software by indicating how to access it, it +being understood that the additional cost of acquiring the Source Code +shall not exceed the cost of transferring the data. + + + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + +When the Licensee makes a Contribution to the Software, the terms and +conditions for the distribution of the resulting Modified Software +become subject to all the provisions of this Agreement. + +The Licensee is authorized to distribute the Modified Software, in +source code or object code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the object code of the Modified +Software is redistributed, the Licensee allows future Licensees +unhindered access to the full source code of the Modified Software by +indicating how to access it, it being understood that the additional +cost of acquiring the source code shall not exceed the cost of +transferring the data. + + + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + +When the Licensee has developed an External Module, the terms and +conditions of this Agreement do not apply to said External Module, that +may be distributed under a separate license agreement. 
+ + + 5.3.4 COMPATIBILITY WITH THE GNU GPL + +The Licensee can include a code that is subject to the provisions of one +of the versions of the GNU GPL in the Modified or unmodified Software, +and distribute that entire code under the terms of the same version of +the GNU GPL. + +The Licensee can include the Modified or unmodified Software in a code +that is subject to the provisions of one of the versions of the GNU GPL, +and distribute that entire code under the terms of the same version of +the GNU GPL. + + + Article 6 - INTELLECTUAL PROPERTY + + + 6.1 OVER THE INITIAL SOFTWARE + +The Holder owns the economic rights over the Initial Software. Any or +all use of the Initial Software is subject to compliance with the terms +and conditions under which the Holder has elected to distribute its work +and no one shall be entitled to modify the terms and conditions for the +distribution of said Initial Software. + +The Holder undertakes that the Initial Software will remain ruled at +least by this Agreement, for the duration set forth in Article 4.2. + + + 6.2 OVER THE CONTRIBUTIONS + +The Licensee who develops a Contribution is the owner of the +intellectual property rights over this Contribution as defined by +applicable law. + + + 6.3 OVER THE EXTERNAL MODULES + +The Licensee who develops an External Module is the owner of the +intellectual property rights over this External Module as defined by +applicable law and is free to choose the type of agreement that shall +govern its distribution. + + + 6.4 JOINT PROVISIONS + +The Licensee expressly undertakes: + + 1. not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + + 2. to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. 
+ +The Licensee undertakes not to directly or indirectly infringe the +intellectual property rights of the Holder and/or Contributors on the +Software and to take, where applicable, vis-à-vis its staff, any and all +measures required to ensure respect of said intellectual property rights +of the Holder and/or Contributors. + + + Article 7 - RELATED SERVICES + +7.1 Under no circumstances shall the Agreement oblige the Licensor to +provide technical assistance or maintenance services for the Software. + +However, the Licensor is entitled to offer this type of services. The +terms and conditions of such technical assistance, and/or such +maintenance, shall be set forth in a separate instrument. Only the +Licensor offering said maintenance and/or technical assistance services +shall incur liability therefor. + +7.2 Similarly, any Licensor is entitled to offer to its licensees, under +its sole responsibility, a warranty, that shall only be binding upon +itself, for the redistribution of the Software and/or the Modified +Software, under terms and conditions that it is free to decide. Said +warranty, and the financial terms and conditions of its application, +shall be subject of a separate instrument executed between the Licensor +and the Licensee. + + + Article 8 - LIABILITY + +8.1 Subject to the provisions of Article 8.2, the Licensee shall be +entitled to claim compensation for any direct loss it may have suffered +from the Software as a result of a fault on the part of the relevant +Licensor, subject to providing evidence thereof. + +8.2 The Licensor's liability is limited to the commitments made under +this Agreement and shall not be incurred as a result of in particular: +(i) loss due the Licensee's total or partial failure to fulfill its +obligations, (ii) direct or consequential loss that is suffered by the +Licensee due to the use or performance of the Software, and (iii) more +generally, any consequential loss. 
In particular the Parties expressly +agree that any or all pecuniary or business loss (i.e. loss of data, +loss of profits, operating loss, loss of customers or orders, +opportunity cost, any disturbance to business activities) or any or all +legal proceedings instituted against the Licensee by a third party, +shall constitute consequential loss and shall not provide entitlement to +any or all compensation from the Licensor. + + + Article 9 - WARRANTY + +9.1 The Licensee acknowledges that the scientific and technical +state-of-the-art when the Software was distributed did not enable all +possible uses to be tested and verified, nor for the presence of +possible defects to be detected. In this respect, the Licensee's +attention has been drawn to the risks associated with loading, using, +modifying and/or developing and reproducing the Software which are +reserved for experienced users. + +The Licensee shall be responsible for verifying, by any or all means, +the suitability of the product for its requirements, its good working +order, and for ensuring that it shall not cause damage to either persons +or properties. + +9.2 The Licensor hereby represents, in good faith, that it is entitled +to grant all the rights over the Software (including in particular the +rights set forth in Article 5). + +9.3 The Licensee acknowledges that the Software is supplied "as is" by +the Licensor without any other express or tacit warranty, other than +that provided for in Article 9.2 and, in particular, without any warranty +as to its commercial value, its secured, safe, innovative or relevant +nature. + +Specifically, the Licensor does not warrant that the Software is free +from any error, that it will operate without interruption, that it will +be compatible with the Licensee's own equipment and software +configuration, nor that it will meet the Licensee's requirements. 
+ +9.4 The Licensor does not either expressly or tacitly warrant that the +Software does not infringe any third party intellectual property right +relating to a patent, software or any other property right. Therefore, +the Licensor disclaims any and all liability towards the Licensee +arising out of any or all proceedings for infringement that may be +instituted in respect of the use, modification and redistribution of the +Software. Nevertheless, should such proceedings be instituted against +the Licensee, the Licensor shall provide it with technical and legal +assistance for its defense. Such technical and legal assistance shall be +decided on a case-by-case basis between the relevant Licensor and the +Licensee pursuant to a memorandum of understanding. The Licensor +disclaims any and all liability as regards the Licensee's use of the +name of the Software. No warranty is given as regards the existence of +prior rights over the name of the Software or as regards the existence +of a trademark. + + + Article 10 - TERMINATION + +10.1 In the event of a breach by the Licensee of its obligations +hereunder, the Licensor may automatically terminate this Agreement +thirty (30) days after notice has been sent to the Licensee and has +remained ineffective. + +10.2 A Licensee whose Agreement is terminated shall no longer be +authorized to use, modify or distribute the Software. However, any +licenses that it may have granted prior to termination of the Agreement +shall remain valid subject to their having been granted in compliance +with the terms and conditions hereof. 
+ + + Article 11 - MISCELLANEOUS + + + 11.1 EXCUSABLE EVENTS + +Neither Party shall be liable for any or all delay, or failure to +perform the Agreement, that may be attributable to an event of force +majeure, an act of God or an outside cause, such as defective +functioning or interruptions of the electricity or telecommunications +networks, network paralysis following a virus attack, intervention by +government authorities, natural disasters, water damage, earthquakes, +fire, explosions, strikes and labor unrest, war, etc. + +11.2 Any failure by either Party, on one or more occasions, to invoke +one or more of the provisions hereof, shall under no circumstances be +interpreted as being a waiver by the interested Party of its right to +invoke said provision(s) subsequently. + +11.3 The Agreement cancels and replaces any or all previous agreements, +whether written or oral, between the Parties and having the same +purpose, and constitutes the entirety of the agreement between said +Parties concerning said purpose. No supplement or modification to the +terms and conditions hereof shall be effective as between the Parties +unless it is made in writing and signed by their duly authorized +representatives. + +11.4 In the event that one or more of the provisions hereof were to +conflict with a current or future applicable act or legislative text, +said act or legislative text shall prevail, and the Parties shall make +the necessary amendments so as to comply with said act or legislative +text. All other provisions shall remain effective. Similarly, invalidity +of a provision of the Agreement, for any reason whatsoever, shall not +cause the Agreement as a whole to be invalid. + + + 11.5 LANGUAGE + +The Agreement is drafted in both French and English and both versions +are deemed authentic. + + + Article 12 - NEW VERSIONS OF THE AGREEMENT + +12.1 Any person is authorized to duplicate and distribute copies of this +Agreement. 
+ +12.2 So as to ensure coherence, the wording of this Agreement is +protected and may only be modified by the authors of the License, who +reserve the right to periodically publish updates or new versions of the +Agreement, each with a separate number. These subsequent versions may +address new issues encountered by Free Software. + +12.3 Any Software distributed under a given version of the Agreement may +only be subsequently distributed under the same version of the Agreement +or a subsequent version, subject to the provisions of Article 5.3.4. + + + Article 13 - GOVERNING LAW AND JURISDICTION + +13.1 The Agreement is governed by French law. The Parties agree to +endeavor to seek an amicable solution to any disagreements or disputes +that may arise during the performance of the Agreement. + +13.2 Failing an amicable solution within two (2) months as from their +occurrence, and unless emergency proceedings are necessary, the +disagreements or disputes shall be referred to the Paris Courts having +jurisdiction, by the more diligent Party. + + +Version 2.0 dated 2006-09-05. diff -r d22fadc825e3 -r 2c0c0a89fad7 README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,75 @@ +---------- +| NAME | +---------- +S-MART + + +Description +----------- +Several tools are now available for mapping high-throughput sequencing data from a genome, but few can extract biological knowledge from the mapped reads. We have developed a toolbox, S-MART, which handles mapped RNA-Seq and ChIP-Seq data. + +S-MART is an intuitive and lightweight tool, performing several tasks that are usually required during the analysis of mapped RNA-Seq and ChIP-Seq reads, including data selection and data visualization. + +S-MART does not require a computer science background and thus can be used by all biologists through a graphical interface. S-MART can run on any personal computer, yielding results within an hour for most queries. 
+ + +Copyright +--------- +Copyright INRA-URGI 2009-2013 + + +Authors +------- +Matthias Zytnicki + + +Contact +------- +urgi-contact@versailles.inra.fr + + +License +------- +This library is distributed under the terms of the CeCILL license +(http://www.cecill.info/index.en.html). +See the LICENSE.txt file. + + +Installation under Galaxy +------------------------- +S-MART is available under the Galaxy Tool Shed: http://toolshed.g2.bx.psu.edu/ +Remember to set the variables "tool_config_file" and "tool_dependency_dir" accordingly. Please look up the Galaxy Tool Shed wiki to know more about it. +It assumes you have R installed, as well as two packages: RColorBrewer (for colors in graphics), and Hmisc (for statistics). You can install them as root with the commands: + - R --slave --no-save --no-restore --quiet -e 'if("RColorBrewer" %in% rownames(installed.packages()) == FALSE){install.packages("RColorBrewer", repos = c("http://cran.rstudio.com/"), dependencies = TRUE)}' + - R --slave --no-save --no-restore --quiet -e 'if("Hmisc" %in% rownames(installed.packages()) == FALSE){install.packages("Hmisc", repos = c("http://cran.rstudio.com/"), dependencies = TRUE)}' + +Optionally, you can organize the layout of S-MART tools following these instructions. This way, all the tools will be correctly sorted and appear in categories. + - Locate the directory where S-MART has been installed: probably in "/shed_tool/toolshed.g2.bx.psu.edu/repos/yufei-luo/s_mart/XXX/s_mart/" + - Create a symbolic link "/tools/s_mart" directing to "/SMART/galaxy/" + - Paste the content of "/SMART/galaxy/tool_conf.xml" to your local "/tool_conf.xml", for instance, right before the mark-up. + - Remove the S-MART layout in "/shed_tool_conf.xml" (the name may vary depending on your "universe_wsgi.ini" file) which has been automatically generated: remove the whole block between the markup
and the corresponding
. + - Restart Galaxy to complete the install. + + +Stand-alone installation +------------------------ +This product needs the following software: + - R, under the GNU General Public License, and several R packages (under the same License) + - Python, under the Python License, compatible with the GNU General Public License + - Java, under the GNU General Public License + + +Instructions +------------ +Further installation instructions and the user guide are available in the file "doc.pdf". + + +Acknowledgements +---------------- +Many thanks go to the helping developers: + - Yufei Luo + - the URGI team +and the beta-testers: + - Claire Toffano-Nioche + - Claire Kuchly + - among others... diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/CleanTranscriptFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CleanTranscriptFile.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,74 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
class CleanTranscriptFile(object):
    """Drive the cleaning of a transcript file.

    The actual work is delegated to the cleaner object returned by
    ``CleanerChooser`` for the requested format.  ``setInputFile`` is the
    call that creates ``self.cleaner``, so it has to be invoked before
    ``setOutputFile``, ``setAcceptedTypes`` (with a non-None value) or
    ``run``; otherwise those methods raise ``AttributeError``.
    """

    def __init__(self, verbosity):
        """Remember the trace level and build the cleaner chooser.

        :param verbosity: trace level, forwarded to ``CleanerChooser``
        """
        self.verbosity = verbosity
        self.chooser = CleanerChooser(self.verbosity)

    def setInputFile(self, fileName, format):
        """Pick the cleaner for *format* and give it the input file name.

        :param fileName: name of the file to clean
        :param format: format name handed to ``CleanerChooser.findFormat``
        """
        self.chooser.findFormat(format)
        self.cleaner = self.chooser.getCleaner()
        self.cleaner.setInputFileName(fileName)

    def setOutputFile(self, fileName):
        """Give the output file name to the cleaner."""
        self.cleaner.setOutputFileName(fileName)

    def setAcceptedTypes(self, types):
        """Forward the list of accepted types to the cleaner.

        :param types: collection of type names, or None to leave the
            cleaner untouched (an empty collection is still forwarded)
        """
        # Identity test: only the None sentinel means "no restriction given".
        if types is not None:
            self.cleaner.setAcceptedTypes(types)

    def run(self):
        """Ask the cleaner to clean the input file."""
        self.cleaner.clean()


if __name__ == "__main__":

    description = "Clean Transcript File v1.0.1: Clean a transcript file so that it is useable for S-MART. [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-t", "--types", dest="acceptedTypes", action="store", default=None, type="string", help="name of the types you want to keep in GFF/GTF (list separated by commas) [format: string] [default: None]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # -i and -f are advertised as compulsory: stop with a usage message
    # instead of handing None to the chooser/cleaner.
    if options.inputFileName is None or options.format is None:
        parser.error("options -i/--input and -f/--format are compulsory")

    # "-t a,b,c" becomes ["a", "b", "c"]; no -t keeps every type.
    acceptedTypes = None
    if options.acceptedTypes is not None:
        acceptedTypes = options.acceptedTypes.split(",")

    ctf = CleanTranscriptFile(options.verbosity)
    ctf.setInputFile(options.inputFileName, options.format)
    ctf.setOutputFile(options.outputFileName)
    ctf.setAcceptedTypes(acceptedTypes)
    ctf.run()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import random
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter


# Supported ways of comparing the tag values of two consecutive elements.
OPERATIONS = ("diff", "div")
# Strand values scanned by run(): a single pass (no strand filter) when all
# strands are processed together, one pass per strand (-1 and 1) otherwise.
BOOLTOSTRANDS = {True: [0], False: [-1, 1]}


class ClusterizeByTags(object):
    """
    Merge consecutive elements whose tag values follow a steady trend.

    The input is loaded into per-chromosome MySQL tables, read back ordered
    by (start, end), and neighbouring elements are merged as long as the
    change of their tag value stays within the threshold.  Each cluster is
    written in GFF3 format with the tag set to the sum of its members' values.
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.connection = MySqlConnection(self.verbosity-1)
        # Was misspelt "defautValue", which left "defaultValue" undefined
        # until setTag() was called.
        self.defaultValue = None
        self.maxDistance = None
        self.oneStrand = False

    def setInputFile(self, fileName, format):
        """Parse fileName (given in the named format) and store it in MySQL tables."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        parser = chooser.getParser(fileName)
        writer = MySqlTranscriptWriter(self.connection, None, self.verbosity)
        writer.addTranscriptList(parser)
        writer.write()
        # The tables are indexed by chromosome name (see run()).
        self.transcriptTables = writer.getTables()

    def setOutputFile(self, fileName):
        """Open the GFF3 writer which receives the clusters."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def setTag(self, tagName, defaultValue):
        """Set the tag to read, and the value used for elements lacking it."""
        self.tagName = tagName
        self.defaultValue = defaultValue

    def setThreshold(self, threshold):
        """Set the maximum change of trend allowed inside a cluster."""
        self.threshold = threshold

    def setOperation(self, operation):
        """
        Set how two consecutive values are compared ("diff" or "div").

        Raises an Exception when the operation is not one of OPERATIONS.
        """
        self.operation = operation
        if self.operation not in OPERATIONS:
            raise Exception("Operation '%s' unsupported: choose among %s" % (self.operation, ", ".join(OPERATIONS)))

    def setMaxDistance(self, distance):
        """Set the maximum distance between two merged elements (None: no limit)."""
        self.maxDistance = distance

    def setOneStrand(self, oneStrand):
        """Choose whether elements of different strands may be clustered together."""
        self.oneStrand = oneStrand

    def _canMerge(self, previousTranscript, transcript, previousTrend, trend):
        """Tell whether transcript can be appended to the cluster held by previousTranscript."""
        # A cluster which has no trend yet accepts any value.
        if previousTrend is not None and abs(trend - previousTrend) > self.threshold:
            return False
        if self.maxDistance is not None and previousTranscript.getDistance(transcript) > self.maxDistance:
            return False
        # The original test read "or not self.oneStrand", which made the
        # cross-strand merge (and the reverse() call in run()) unreachable.
        return self.oneStrand or previousTranscript.getDirection() == transcript.getDirection()

    def _flush(self, transcript, sumValue):
        """Write a finished cluster, tagged with the sum of its members' values."""
        transcript.setTagValue(self.tagName, sumValue)
        self.writer.addTranscript(transcript)

    def run(self):
        """Cluster every chromosome (and strand) and write the result."""
        for chromosome in sorted(self.transcriptTables.keys()):
            table = self.transcriptTables[chromosome]
            progress = Progress(table.getNbElements(), "Analyzing %s" % (chromosome), self.verbosity)
            for strand in BOOLTOSTRANDS[self.oneStrand]:
                previousValue = None
                previousTrend = None
                previousTranscript = None
                sumValue = 0
                command = "SELECT * FROM %s" % (table.getName())
                if not self.oneStrand:
                    command += " WHERE direction = %d" % (strand)
                command += " ORDER BY start, end"
                for index, transcript in table.selectTranscripts(command):
                    if self.tagName in transcript.getTagNames():
                        value = transcript.getTagValue(self.tagName)
                    else:
                        value = self.defaultValue
                    if previousValue is None:
                        trend = None
                    elif self.operation == "diff":
                        trend = value - previousValue
                    else:
                        # NOTE(review): integer tag values give a truncated
                        # ratio under Python 2, and a previous value of 0
                        # raises ZeroDivisionError -- confirm the intent.
                        trend = value / previousValue
                    if previousTranscript is None:
                        sumValue = value
                    elif self._canMerge(previousTranscript, transcript, previousTrend, trend):
                        if previousTranscript.getDirection() != transcript.getDirection():
                            transcript.reverse()
                        previousTranscript.merge(transcript)
                        # The merged cluster stays the element to compare with.
                        transcript = previousTranscript
                        sumValue += value
                        previousTrend = trend
                    else:
                        self._flush(previousTranscript, sumValue)
                        sumValue = value
                        previousTrend = None
                    previousValue = value
                    previousTranscript = transcript
                    progress.inc()
                # Do not forget the cluster still open at the end of the scan.
                if previousTranscript is not None:
                    self._flush(previousTranscript, sumValue)
            progress.done()
        self.writer.close()


if __name__ == "__main__":

    description = "Clusterize By Tags v1.0.1: Clusterize a set of element using their tag values. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-t", "--tag", dest="tagName", action="store", type="string", help="name of the tag [format: string] [compulsory]")
    parser.add_option("-e", "--default", dest="defaultValue", action="store", default=None, type="int", help="default value for the tag [format: int]")
    parser.add_option("-r", "--threshold", dest="threshold", action="store", type="int", help="threshold between two consecutive tags [format: int] [compulsory]")
    parser.add_option("-p", "--operation", dest="operation", action="store", type="string", help="operation to apply between 2 different clusters to compare them [format: choice (diff, div)] [compulsory]")
    parser.add_option("-d", "--distance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance for 2 clusters to be merged [format: int] [default: None]")
    parser.add_option("-1", "--oneStrand", dest="oneStrand", action="store_true", default=False, help="also cluster the elements which are on different strands [format: bool] [default: False]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # The calls below used the undefined names "option" and "operation"
    # instead of "options", so the script stopped with a NameError.
    cbt = ClusterizeByTags(options.verbosity)
    cbt.setInputFile(options.inputFileName, options.format)
    cbt.setOutputFile(options.outputFileName)
    cbt.setTag(options.tagName, options.defaultValue)
    cbt.setThreshold(options.threshold)
    cbt.setOperation(options.operation)
    cbt.setMaxDistance(options.maxDistance)
    cbt.setOneStrand(options.oneStrand)
    cbt.run()

# Patch boundary: header of the next file of the changeset, kept as comments.
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/CollapseReads.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/CollapseReads.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,174 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os
from optparse import OptionParser, OptionGroup
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
from SMART.Java.Python.ncList.FileSorter import FileSorter
from SMART.Java.Python.misc.Progress import Progress


class CollapseReads(object):
    """
    Merge two reads if they have exactly the same genomic coordinates
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.inputReader = None
        self.outputWriter = None
        # When True, two reads are only merged if they have the same direction.
        self.strands = True
        self.nbRead = 0
        self.nbWritten = 0
        self.nbMerges = 0
        # Per-chromosome sorted files created by _sortFile(), removed in __del__().
        self.splittedFileNames = {}

    def __del__(self):
        # Best-effort cleanup of the temporary sorted files: a file which is
        # already gone must not make the destructor fail.
        for fileName in getattr(self, "splittedFileNames", {}).values():
            try:
                os.remove(fileName)
            except OSError:
                pass

    def close(self):
        """Close the output writer; calling it more than once is harmless."""
        if self.outputWriter is not None:
            self.outputWriter.close()
            self.outputWriter = None

    def setInputFile(self, fileName, format):
        """Select the parser for fileName and derive the name of the sorted file."""
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format)
        self.parser = parserChooser.getParser(fileName)
        self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])

    def setOutputFile(self, fileName):
        """Open the GFF3 writer which receives the collapsed reads."""
        self.outputWriter = Gff3Writer(fileName, self.verbosity)

    def getNbElements(self):
        """Return the number of transcripts reported by the input parser."""
        return self.parser.getNbTranscripts()

    def _sortFile(self):
        """Sort the input per chromosome and record the resulting files and counts."""
        fs = FileSorter(self.parser, self.verbosity-4)
        fs.perChromosome(True)
        fs.setOutputFileName(self.sortedFileName)
        fs.sort()
        self.splittedFileNames = fs.getOutputFileNames()
        self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
        self.nbRead = fs.getNbElements()

    def _iterate(self, chromosome):
        """
        Collapse the reads of one chromosome, read from its sorted file.

        Each new read is compared with the pending ones: a pending read with
        the same coordinates is merged into the new read, a pending read
        which starts before the new read is written, any other stays pending.
        """
        progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
        transcripts = []
        parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
        for newTranscript in parser.getIterator():
            newTranscripts = []
            for oldTranscript in transcripts:
                if self._checkOverlap(newTranscript, oldTranscript):
                    self._merge(newTranscript, oldTranscript)
                elif self._checkPassed(newTranscript, oldTranscript):
                    self._write(oldTranscript)
                else:
                    newTranscripts.append(oldTranscript)
            newTranscripts.append(newTranscript)
            transcripts = newTranscripts
            progress.inc()
        # Everything still pending at the end of the chromosome is written.
        for transcript in transcripts:
            self._write(transcript)
        progress.done()

    def _merge(self, transcript1, transcript2):
        """Merge transcript2 into transcript1, after giving it transcript1's direction."""
        self.nbMerges += 1
        transcript2.setDirection(transcript1.getDirection())
        transcript1.merge(transcript2)

    def _write(self, transcript):
        """Send one transcript to the output writer and count it."""
        self.nbWritten += 1
        self.outputWriter.addTranscript(transcript)

    def _checkOverlap(self, transcript1, transcript2):
        """Tell whether both transcripts share start and end (and direction, if strands is set)."""
        if transcript1.getStart() != transcript2.getStart() or transcript1.getEnd() != transcript2.getEnd():
            return False
        return (not self.strands or transcript1.getDirection() == transcript2.getDirection())

    def _checkPassed(self, transcript1, transcript2):
        """Tell whether transcript2 starts strictly before transcript1."""
        return (transcript2.getStart() < transcript1.getStart())

    def collapseChromosome(self, chromosome):
        # NOTE(review): this method reads the undefined name "table" and calls
        # self.writeTranscript(), which this class does not define, so calling
        # it raises a NameError.  collapse() relies on _iterate() instead.
        # The code is left unchanged until the intended table source is known.
        progress = Progress(table.getNbElements(), "Analysing chromosome %s" % (chromosome), self.verbosity)
        command = "SELECT * FROM %s ORDER BY start ASC, end DESC" % (table.name)
        transcriptStart = None
        transcriptEnd = None
        transcriptDirection = None
        currentTranscript = None
        if self.strands:
            command += ", direction"
        for index, transcript in table.selectTranscripts(command, True):
            self.nbRead += 1
            if not self.strands:
                transcript.setDirection("+")
            if transcriptStart != transcript.getStart() or transcriptEnd != transcript.getEnd() or transcriptDirection != transcript.getDirection():
                self.writeTranscript(currentTranscript)
                transcriptStart = transcript.getStart()
                transcriptEnd = transcript.getEnd()
                transcriptDirection = transcript.getDirection()
                currentTranscript = transcript
            else:
                currentTranscript.setTagValue("nbElements", (currentTranscript.getTagValue("nbElements") + 1) if "nbElements" in currentTranscript.getTagNames() else 1)
            progress.inc()
        self.writeTranscript(currentTranscript)
        progress.done()

    def collapse(self):
        """Sort the input, collapse every chromosome, close the output and print statistics."""
        self._sortFile()
        for chromosome in sorted(self.nbElementsPerChromosome.keys()):
            self._iterate(chromosome)
        # close() is idempotent, so the later call made by the command line
        # code no longer closes the writer a second time.
        self.close()
        if self.verbosity > 1:
            # An empty input must not crash the statistics.
            percentage = float(self.nbWritten) / self.nbRead * 100 if self.nbRead else 0.0
            # Parenthesised single-argument print: same output under Python 2,
            # and valid under Python 3.
            print("# reads read: %d" % (self.nbRead))
            print("# reads written: %d (%.2f%%)" % (self.nbWritten, percentage))
            print("# reads merges: %d" % (self.nbMerges))

if __name__ == "__main__":

    # parse command line
    description = "Collapse Reads v1.0.3: Merge two reads if they have exactly the same genomic coordinates. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in mapping format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: mapping file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="merge elements on 2 different strands [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    (options, args) = parser.parse_args()

    collapser = CollapseReads(options.verbosity)
    collapser.setInputFile(options.inputFileName, options.format)
    collapser.setOutputFile(options.outputFileName)
    # The option asks to merge across strands; the attribute asks to keep
    # strands apart, hence the negation.
    collapser.strands = not options.strands
    collapser.collapse()
    collapser.close()

# Patch boundary: header of the next file of the changeset, kept as comments.
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/CombineTags.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/CombineTags.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,115 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2011
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os
import random
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.Gff3Writer import Gff3Writer

# Arithmetic operations which can combine the two tags.
OPERATIONS = ("plus", "minus", "times", "div")


class CombineTags(object):
    """
    Combine two numeric tags of each transcript into a new tag.

    Both tags are converted to float and combined with one of OPERATIONS;
    the result is stored under the output tag and the transcript is written
    in GFF3 format.
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity

    def setInputFile(self, fileName, format):
        """Select the transcript parser for fileName, given in the named format."""
        self.inputFileName = fileName
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "transcript")
        self.parser = parserChooser.getParser(fileName)

    def setOutputFile(self, fileName):
        """Open the GFF3 writer which receives the modified transcripts."""
        self.outputWriter = Gff3Writer(fileName, self.verbosity)

    def setTags(self, tag1, tag2, outputTag, defaultValue = None):
        """
        Set the two tags to combine and the name of the resulting tag.

        defaultValue is stored in the output tag when one of the input tags
        is missing; when it is None, a missing tag makes run() fail.
        """
        self.tag1 = tag1
        self.tag2 = tag2
        self.outputTag = outputTag
        self.defaultValue = defaultValue

    def setOperation(self, operation):
        """
        Set the operation which combines the tags.

        Raises an Exception when the operation is not one of OPERATIONS.
        """
        self.operation = operation
        if self.operation not in OPERATIONS:
            raise Exception("Do no handle operation %s, only: %s" % (self.operation, ", ".join(OPERATIONS)))

    def _combine(self, transcript, tag1, tag2):
        """Apply the selected operation to the two values, converted to float."""
        tag1, tag2 = float(tag1), float(tag2)
        if self.operation == "plus":
            return tag1 + tag2
        if self.operation == "minus":
            return tag1 - tag2
        if self.operation == "times":
            return tag1 * tag2
        if self.operation == "div":
            # Same exception type as the bare division, with enough context
            # to find the offending transcript.
            if tag2 == 0:
                raise ZeroDivisionError("Transcript %s: cannot divide tag %s by tag %s, whose value is zero" % (transcript, self.tag1, self.tag2))
            return tag1 / tag2
        raise Exception("Do no handle operation %s, only: %s" % (self.operation, ", ".join(OPERATIONS)))

    def run(self):
        """
        Combine the tags of every transcript and write the result.

        Raises an Exception when a transcript misses one of the tags and no
        default value was given, and a ZeroDivisionError when the operation
        is "div" and the second tag is zero.
        """
        progress = Progress(self.parser.getNbTranscripts(), "Printing transcripts %s" % (self.inputFileName), self.verbosity)
        try:
            for transcript in self.parser.getIterator():
                tag1 = transcript.getTagValue(self.tag1)
                tag2 = transcript.getTagValue(self.tag2)
                if tag1 is None or tag2 is None:
                    if self.defaultValue is None:
                        raise Exception("Transcript %s misses one of the tags %s and %s, and has no default value !" % (transcript, self.tag1, self.tag2))
                    newTag = self.defaultValue
                else:
                    newTag = self._combine(transcript, tag1, tag2)
                transcript.setTagValue(self.outputTag, newTag)
                self.outputWriter.addTranscript(transcript)
                progress.inc()
            progress.done()
        finally:
            # Release the input and output even when a transcript is rejected.
            self.parser.close()
            self.outputWriter.close()


if __name__ == "__main__":

    # parse command line
    # The description used to be the one of the "Change Tag Name" tool.
    description = "Combine Tags v1.0.1: Combine the values of two tags of a list of transcripts. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-t", "--tag1", dest="tag1", action="store", type="string", help="name of the first tag [compulsory] [format: string]")
    parser.add_option("-T", "--tag2", dest="tag2", action="store", type="string", help="name of the second tag [compulsory] [format: string]")
    # Parsed as a float, as documented, so that default and computed values
    # of the new tag have the same type.
    parser.add_option("-d", "--default", dest="defaultValue", action="store", default=None, type="float", help="default value when one of the tag is absent [compulsory] [format: float]")
    parser.add_option("-n", "--new", dest="newTag", action="store", type="string", help="name of the new tag [compulsory] [format: string]")
    parser.add_option("-p", "--operation", dest="operation", action="store", type="string", help="operation combining the tags [compulsory] [format: choice (plus, minus, times, div)]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    combiner = CombineTags(options.verbosity)
    combiner.setInputFile(options.inputFileName, options.inputFormat)
    combiner.setOutputFile("%s.gff3" % (options.outputFileName))
    combiner.setTags(options.tag1, options.tag2, options.newTag, options.defaultValue)
    combiner.setOperation(options.operation)
    combiner.run()

# Patch boundary: header of the next file of the changeset, kept as comments.
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/CompareOverlapping.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/CompareOverlapping.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,491 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge.
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import os, struct, time, random +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.ncList.NCList import NCList +from SMART.Java.Python.ncList.NCListCursor import NCListCursor +from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle +from SMART.Java.Python.ncList.NCListHandler import NCListHandler +from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress +from SMART.Java.Python.misc import Utils +try: + import cPickle as pickle +except: + import pickle + +REFERENCE = 0 +QUERY = 1 +TYPES = (REFERENCE, QUERY) +TYPETOSTRING = {0: "reference", 1: "query"} + +class CompareOverlapping(object): + + def __init__(self, verbosity = 1): + self._outputFileName = "outputOverlaps.gff3" + self._iWriter = None + self._nbOverlappingQueries = 0 + self._nbOverlaps = 0 + self._nbLines = {REFERENCE: 0, QUERY: 0} + self._verbosity = verbosity + self._ncLists = {} + self._cursors = {} + self._splittedFileNames = {} + self._nbElements = {} + self._nbElementsPerChromosome = {} + self._inputFileNames = {REFERENCE: None, QUERY: None} + self._inputFileFormats = {REFERENCE: None, QUERY: None} + self._starts = {REFERENCE: None, QUERY: None} + self._ends = 
{REFERENCE: None, QUERY: None} + self._fivePrimes = {REFERENCE: None, QUERY: None} + self._threePrimes = {REFERENCE: None, QUERY: None} + self._ncListHandlers = {REFERENCE: None, QUERY: None} + self._convertedFileNames = {REFERENCE: False, QUERY: False} + self._sorted = False + self._index = False + self._introns = False + self._antisense = False + self._colinear = False + self._invert = False + self._distance = 0 + self._minOverlap = 1 + self._pcOverlap = None + self._included = False + self._including = False + self._outputNotOverlapping = False + self._tmpRefFileName = None + self._currentQueryTranscript = None + self._currentOrQueryTranscript = None + self._currentExQueryTranscript = None + self._randInt = random.randint(0, 100000) + + def __del__(self): + for fileName in [self._tmpRefFileName] + self._convertedFileNames.values(): + if fileName != None and os.path.exists(fileName): + os.remove(fileName) + + def close(self): + self._iWriter.close() + + def setInput(self, fileName, format, type): + chooser = ParserChooser(self._verbosity) + chooser.findFormat(format) + self._inputFileNames[type] = fileName + self._inputFileFormats[type] = format + + def setOutput(self, outputFileName): + if outputFileName != '': + self._outputFileName = outputFileName + self._iWriter = Gff3Writer(self._outputFileName) + + def setSorted(self, sorted): + self._sorted = sorted + + def setIndex(self, index): + self._index = index + + def restrictToStart(self, distance, type): + self._starts[type] = distance + + def restrictToEnd(self, distance, type): + self._ends[type] = distance + + def extendFivePrime(self, distance, type): + self._fivePrimes[type] = distance + + def extendThreePrime(self, distance, type): + self._threePrimes[type] = distance + + def acceptIntrons(self, boolean): + self._introns = boolean + + def getAntisenseOnly(self, boolean): + self._antisense = boolean + + def getColinearOnly(self, boolean): + self._colinear = boolean + + def getInvert(self, boolean): + 
self._invert = boolean + + def setMaxDistance(self, distance): + self._distance = distance + + def setMinOverlap(self, overlap): + self._minOverlap = overlap + + def setPcOverlap(self, overlap): + self._pcOverlap = overlap + + def setIncludedOnly(self, boolean): + self._included = boolean + + def setIncludingOnly(self, boolean): + self._including = boolean + + def includeNotOverlapping(self, boolean): + self._outputNotOverlapping = boolean + + def transformTranscript(self, transcript, type): + if self._starts[type] != None: + transcript.restrictStart(self._starts[type]) + if self._ends[type] != None: + transcript.restrictEnd(self._ends[type]) + if self._fivePrimes[type] != None: + transcript.extendStart(self._fivePrimes[type]) + if self._threePrimes[type] != None: + transcript.extendEnd(self._threePrimes[type]) + if self._introns: + transcript.exons = [] + if type == REFERENCE and self._distance > 0: + transcript.extendExons(self._distance) + return transcript + + def extendQueryTranscript(self, transcript): + self._currentExQueryTranscript = Transcript() + self._currentExQueryTranscript.copy(transcript) + if self._fivePrimes[QUERY] != None: + self._currentExQueryTranscript.extendStart(self._fivePrimes[QUERY]) + if self._threePrimes[QUERY] != None: + self._currentExQueryTranscript.extendEnd(self._threePrimes[QUERY]) + transcript.exons = [] + + def createTmpRefFile(self): + self._tmpRefFileName = "tmp_ref_%d.pkl" % (self._randInt) + if "SMARTTMPPATH" in os.environ: + self._tmpRefFileName = os.path.join(os.environ["SMARTTMPPATH"], self._tmpRefFileName) + chooser = ParserChooser(self._verbosity) + chooser.findFormat(self._inputFileFormats[REFERENCE]) + parser = chooser.getParser(self._inputFileNames[REFERENCE]) + writer = NCListFilePickle(self._tmpRefFileName, self._verbosity) + for transcript in parser.getIterator(): + transcript = self.transformTranscript(transcript, REFERENCE) + writer.addTranscript(transcript) + writer.close() + self._inputFileNames[REFERENCE] = 
self._tmpRefFileName + self._inputFileFormats[REFERENCE] = "pkl" + + def createNCLists(self): + self._ncLists = dict([type, {}] for type in TYPES) + self._indices = dict([type, {}] for type in TYPES) + self._cursors = dict([type, {}] for type in TYPES) + for type in TYPES: + if self._verbosity > 2: + print "Creating %s NC-list..." % (TYPETOSTRING[type]) + self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type) + ncLists = ConvertToNCList(self._verbosity) + ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type]) + ncLists.setOutputFileName(self._convertedFileNames[type]) + ncLists.setSorted(self._sorted) + if type == REFERENCE and self._index: + ncLists.setIndex(True) + ncLists.run() + self._ncListHandlers[type] = NCListHandler(self._verbosity) + self._ncListHandlers[type].setFileName(self._convertedFileNames[type]) + self._ncListHandlers[type].loadData() + self._nbLines[type] = self._ncListHandlers[type].getNbElements() + self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome() + self._ncLists[type] = self._ncListHandlers[type].getNCLists() + for chromosome, ncList in self._ncLists[type].iteritems(): + self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity) + if type == REFERENCE and self._index: + self._indices[REFERENCE][chromosome] = ncList.getIndex() + if self._verbosity > 2: + print " ...done" + + def compare(self): + nbSkips, nbMoves = 0, 0 + previousChromosome = None + done = False + refNCList = None + queryNCList = None + startTime = time.time() + progress = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity) + for chromosome, queryNCList in self._ncLists[QUERY].iteritems(): + queryParser = self._ncListHandlers[QUERY].getParser(chromosome) + queryNCList = self._ncLists[QUERY][chromosome] + queryCursor = self._cursors[QUERY][chromosome] + if chromosome != previousChromosome: + skipChromosome = False 
+ previousChromosome = chromosome + if chromosome not in self._ncLists[REFERENCE]: + if self._outputNotOverlapping: + while not queryCursor.isOut(): + self._currentQueryTranscript = queryCursor.getTranscript() + self._writeIntervalInNewGFF3({}) + if queryCursor.hasChildren(): + queryCursor.moveDown() + else: + queryCursor.moveNext() + progress.inc() + continue + refNCList = self._ncLists[REFERENCE][chromosome] + refCursor = self._cursors[REFERENCE][chromosome] + while True: + self._currentOrQueryTranscript = queryCursor.getTranscript() + self._currentQueryTranscript = Transcript() + self._currentQueryTranscript.copy(self._currentOrQueryTranscript) + self._currentQueryTranscript = self.transformTranscript(self._currentQueryTranscript, QUERY) + self.extendQueryTranscript(self._currentOrQueryTranscript) + newRefLaddr = self.checkIndex(refCursor) + if newRefLaddr != None: + nbMoves += 1 + refCursor.setLIndex(newRefLaddr) + done = False + refCursor, done, unmatched = self.findOverlapIter(refCursor, done) + if refCursor.isOut(): + if not self._invert and not self._outputNotOverlapping: + break + if (unmatched and not self._invert and not self._outputNotOverlapping) or not queryCursor.hasChildren(): + queryCursor.moveNext() + nbSkips += 1 + else: + queryCursor.moveDown() + if queryCursor.isOut(): + break + progress.inc() + progress.done() + endTime = time.time() + self._timeSpent = endTime - startTime + if self._verbosity >= 10: + print "# skips: %d" % (nbSkips) + print "# moves: %d" % (nbMoves) + + def findOverlapIter(self, cursor, done): + chromosome = self._currentQueryTranscript.getChromosome() + matched = False + if chromosome not in self._ncLists[REFERENCE]: + return None, False, True + ncList = self._ncLists[REFERENCE][chromosome] + overlappingNames = {} + nextDone = False + firstOverlapLAddr = NCListCursor(cursor) + firstOverlapLAddr.setLIndex(-1) + if cursor.isOut(): + self._writeIntervalInNewGFF3(overlappingNames) + return firstOverlapLAddr, False, True + 
parentCursor = NCListCursor(cursor) + parentCursor.moveUp() + firstParentAfter = False + while not parentCursor.isOut(): + if self.isOverlapping(parentCursor) == 0: + matched = True + if self._checkOverlap(parentCursor.getTranscript()): + overlappingNames.update(self._extractID(parentCursor.getTranscript())) + if firstOverlapLAddr.isOut(): + firstOverlapLAddr.copy(parentCursor) + nextDone = True + elif self.isOverlapping(parentCursor) == 1: + firstParentAfter = NCListCursor(parentCursor) + parentCursor.moveUp() + if firstParentAfter: + written = self._writeIntervalInNewGFF3(overlappingNames) + return firstParentAfter, False, not written if self._invert else not matched + #This loop finds the overlaps with currentRefLAddr.# + while True: + parentCursor = NCListCursor(cursor) + parentCursor.moveUp() + #In case: Query is on the right of the RefInterval and does not overlap. + overlap = self.isOverlapping(cursor) + if overlap == -1: + cursor.moveNext() + #In case: Query overlaps with RefInterval. + elif overlap == 0: + matched = True + if self._checkOverlap(cursor.getTranscript()): + overlappingNames.update(self._extractID(cursor.getTranscript())) + if firstOverlapLAddr.compare(parentCursor): + firstOverlapLAddr.copy(cursor) + nextDone = True + if done: + cursor.moveNext() + else: + if not cursor.hasChildren(): + cursor.moveNext() + if cursor.isOut(): + break + else: + cursor.moveDown() + #In case: Query is on the left of the RefInterval and does not overlap. 
+ else: + if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor): + firstOverlapLAddr.copy(cursor) + nextDone = False # new + break + + done = False + if cursor.isOut(): + break + written = self._writeIntervalInNewGFF3(overlappingNames) + return firstOverlapLAddr, nextDone, not written if self._invert else not matched + + def isOverlapping(self, refTranscript): + if (self._currentExQueryTranscript.getStart() <= refTranscript.getEnd() and self._currentExQueryTranscript.getEnd() >= refTranscript.getStart()): + return 0 + if self._currentExQueryTranscript.getEnd() < refTranscript.getStart(): + return 1 + return -1 + + def checkIndex(self, cursor): + if not self._index: + return None + if cursor.isOut(): + return None + chromosome = self._currentExQueryTranscript.getChromosome() + nextLIndex = self._indices[REFERENCE][chromosome].getIndex(self._currentExQueryTranscript) + if nextLIndex == None: + return None + ncList = self._ncLists[REFERENCE][chromosome] + nextGffAddress = ncList.getRefGffAddr(nextLIndex) + thisGffAddress = cursor.getGffAddress() + if nextGffAddress > thisGffAddress: + return nextLIndex + return None + + def _writeIntervalInNewGFF3(self, names): + nbOverlaps = 0 + for cpt in names.values(): + nbOverlaps += cpt + self._nbOverlappingQueries += 1 if Utils.xor(names, self._invert) else 0 + self._nbOverlaps += nbOverlaps if Utils.xor(names, self._invert) else 0 + if names: + self._currentQueryTranscript.setTagValue("overlapWith", ",".join(names)) + self._currentQueryTranscript.setTagValue("nbOverlaps", nbOverlaps) + if self._invert: + return False + else: + if self._outputNotOverlapping: + self._currentQueryTranscript.setTagValue("nbOverlaps", 0) + elif not self._invert: + return False + self._iWriter.addTranscript(self._currentQueryTranscript) + self._iWriter.write() + return True + + def _extractID(self, transcript): + id = transcript.getTagValue("ID") if "ID" in transcript.getTagNames() else transcript.getUniqueName() + nbElements = 
transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1 + return {id: float(nbElements)} + + def _checkOverlap(self, refTranscript): + if self._currentQueryTranscript.getDistance(refTranscript) > self._distance: + return False + minOverlap = self._minOverlap + if self._pcOverlap != None: + minOverlap = max(self._minOverlap, self._currentQueryTranscript.getSize() / 100.0 * self._pcOverlap) + if not self._currentQueryTranscript.overlapWith(refTranscript, minOverlap): + return False + if self._antisense and self._currentQueryTranscript.getDirection() == refTranscript.getDirection(): + return False + if self._colinear and self._currentQueryTranscript.getDirection() != refTranscript.getDirection(): + return False + if self._included and not refTranscript.include(self._currentQueryTranscript): + return False + if self._including and not self._currentQueryTranscript.include(refTranscript): + return False + if self._introns: + return True + return self._currentQueryTranscript.overlapWithExon(refTranscript, minOverlap) + + def run(self): + self.createTmpRefFile() + self.createNCLists() + self.compare() + self.close() + if self._verbosity > 0: + print "# queries: %d" % (self._nbLines[QUERY]) + print "# refs: %d" % (self._nbLines[REFERENCE]) + print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps) + print "time: %ds" % (self._timeSpent) + + +if __name__ == "__main__": + description = "Compare Overlapping v1.0.4: Get the data which overlap with a reference set. 
[Category: Data Comparison]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-D", "--index", dest="index", action="store_true", default=False, help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]") + parser.add_option("-r", "--sorted", dest="sorted", action="store_true", default=False, help="input files are already sorted [format: boolean] [default: False]") + parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]") + parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]") + parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]") + parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the 
n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]") + parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]") + parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]") + parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]") + parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]") + parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]") + parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]") + parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]") + parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]") + parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]") + parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]") + parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=1, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]") + parser.add_option("-p", 
"--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]") + parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]") + parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + co = CompareOverlapping(options.verbosity) + co.setInput(options.inputFileName1, options.format1, QUERY) + co.setInput(options.inputFileName2, options.format2, REFERENCE) + co.setOutput(options.output) + co.setSorted(options.sorted) + co.setIndex(options.index) + co.restrictToStart(options.start1, QUERY) + co.restrictToStart(options.start2, REFERENCE) + co.restrictToEnd(options.end1, QUERY) + co.restrictToEnd(options.end2, REFERENCE) + co.extendFivePrime(options.fivePrime1, QUERY) + co.extendFivePrime(options.fivePrime2, REFERENCE) + co.extendThreePrime(options.threePrime1, QUERY) + co.extendThreePrime(options.threePrime2, REFERENCE) + co.acceptIntrons(options.introns) + co.getAntisenseOnly(options.antisense) + co.getColinearOnly(options.colinear) + co.getInvert(options.exclude) + co.setMaxDistance(options.distance) + co.setMinOverlap(options.minOverlap) + co.setPcOverlap(options.pcOverlap) + co.setIncludedOnly(options.included) + co.setIncludingOnly(options.including) + co.includeNotOverlapping(options.notOverlapping) + co.run() diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/CompareOverlappingSmallQuery.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CompareOverlappingSmallQuery.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,261 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.writer.TranscriptWriter import TranscriptWriter +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.Mapping import Mapping +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress + +MINBIN = 3 +MAXBIN = 7 +REFERENCE = 0 +QUERY = 1 + +def getBin(start, end): + for i in range(MINBIN, MAXBIN + 1): + binLevel = 10 ** i + if int(start / binLevel) == int(end / binLevel): + return int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)) + return int((MAXBIN + 1) * 10 ** (MAXBIN + 1)) + +def getOverlappingBins(start, end): + array = [] + bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1)) + for i in range(MINBIN, MAXBIN + 1): + binLevel = 10 ** i + array.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel)))) + array.append((bigBin, bigBin)) + return array + + +class CompareOverlappingSmallQuery(object): + + def __init__(self, verbosity): + self.verbosity = verbosity + self.tableNames = {} + self.nbQueries = 0 + self.nbRefs = 0 + self.nbWritten = 0 + self.nbOverlaps = 0 + self.distance = None + self.invert = False + self.antisense = False + self.collinear = False + self.pcOverlapQuery = False + self.pcOverlapRef = False + self.minOverlap = False + self.included = False + self.including = False + self.bins = {} + self.overlaps = {} + self.notOverlapping = False + + def setReferenceFile(self, fileName, format): + chooser = ParserChooser(self.verbosity) + chooser.findFormat(format) + self.refParser = chooser.getParser(fileName) + + def setQueryFile(self, fileName, format): + chooser = ParserChooser(self.verbosity) + chooser.findFormat(format) + self.queryParser = chooser.getParser(fileName) + + def setOutputFile(self, fileName): + 
self.writer = TranscriptWriter(fileName, "gff3", self.verbosity) + + def setDistance(self, distance): + self.distance = distance + + def setInvert(self, boolean): + self.invert = boolean + + def setCollinear(self, boolean): + self.collinear = boolean + + def setAntisense(self, boolean): + self.antisense = boolean + + def setMinPercentOverlap(self, pcOverlapQuery, pcOverlapRef): + self.pcOverlapQuery = pcOverlapQuery + self.pcOverlapRef = pcOverlapRef + + def setMinOverlap(self, minOverlap): + self.minOverlap = minOverlap + + def setInclude(self, included, including): + self.included = included + self.including = including + + def includeNotOverlapping(self, boolean): + self.notOverlapping = boolean + + def loadQuery(self): + progress = UnlimitedProgress(10000, "Reading queries", self.verbosity) + for transcript in self.queryParser.getIterator(): + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + chromosome = transcript.getChromosome() + bin = getBin(transcript.getStart(), transcript.getEnd()) + if chromosome not in self.bins: + self.bins[chromosome] = {} + if bin not in self.bins[chromosome]: + self.bins[chromosome][bin] = [] + self.bins[chromosome][bin].append(transcript) + if self.notOverlapping or self.invert: + self.overlaps[transcript] = {} + self.nbQueries += 1 + progress.inc() + progress.done() + + def _compareTwoTranscripts(self, queryTranscript, refTranscript): + if not queryTranscript.overlapWithExon(refTranscript): + return False + if self.collinear and queryTranscript.getDirection() != refTranscript.getDirection(): + return False + if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection(): + return False + if self.included and not refTranscript.include(queryTranscript): + return False + if self.including and not queryTranscript.include(refTranscript): + return False + querySize = queryTranscript.getSize() + if self.pcOverlapQuery and not queryTranscript.overlapWithExon(refTranscript, 
int(querySize * self.pcOverlapQuery / 100.0)): + return False + refSize = refTranscript.getSize() + if self.pcOverlapRef and not queryTranscript.overlapWithExon(refTranscript, int(refSize * self.pcOverlapRef / 100.0)): + return False + if self.minOverlap and not queryTranscript.overlapWithExon(refTranscript, self.minOverlap): + return False + return True + + def _alterTranscript(self, transcript, type): + if type == REFERENCE: + if self.distance != None: + transcript.extendExons(self.distance) + return transcript + + def _compareTranscript(self, refTranscript): + refChromosome = refTranscript.getChromosome() + if refChromosome not in self.bins: + return [] + refStart = refTranscript.getStart() + refEnd = refTranscript.getEnd() + bins = getOverlappingBins(refStart, refEnd) + for binRange in bins: + for bin in range(binRange[0], binRange[1]+1): + if bin not in self.bins[refChromosome]: + continue + for queryTranscript in self.bins[refChromosome][bin]: + if self._compareTwoTranscripts(queryTranscript, refTranscript): + if queryTranscript not in self.overlaps: + self.overlaps[queryTranscript] = {} + nbElements = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1 + self.overlaps[queryTranscript][refTranscript.getName()] = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1 + self.nbOverlaps += nbElements + + def _updateTranscript(self, queryTranscript): + overlaps = self.overlaps[queryTranscript] + queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values())) + if overlaps: + queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100]) + return queryTranscript + + def compare(self): + progress = UnlimitedProgress(10000, "Comparing references", self.verbosity) + for refTranscript in self.refParser.getIterator(): + if refTranscript.__class__.__name__ == "Mapping": + refTranscript = refTranscript.getTranscript() + refTranscript = 
self._alterTranscript(refTranscript, REFERENCE) + self._compareTranscript(refTranscript) + self.nbRefs += 1 + progress.inc() + progress.done() + + def printResults(self): + for transcript in self.overlaps: + if not self.invert or not self.overlaps[transcript]: + if not self.invert: + transcript = self._updateTranscript(transcript) + self.writer.addTranscript(transcript) + self.nbWritten += 1 + self.writer.close() + + def displayResults(self): + if self.verbosity: + print "# queries: %d" % (self.nbQueries) + print "# refs: %d" % (self.nbRefs) + print "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps) + + def run(self): + self.loadQuery() + self.compare() + self.printResults() + self.displayResults() + +if __name__ == "__main__": + + description = "Compare Overlapping Small Query v1.0.1: Provide the queries that overlap with a reference, when the query is small. [Category: Data Comparison]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") + parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]") + 
parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]") + parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]") + parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]") + parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=False, type="int", help="min. #nt overlap [format: bool] [default: false]") + parser.add_option("-p", "--pcOverlapQuery", dest="pcOverlapQuery", action="store", default=False, type="int", help="min. % overlap of the query [format: bool] [default: false]") + parser.add_option("-P", "--pcOverlapRef", dest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]") + parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="provide query elements which are nested in reference elements [format: bool] [default: false]") + parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="provide query elements in which reference elements are nested [format: bool] [default: false]") + parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + cosq = CompareOverlappingSmallQuery(options.verbosity) + cosq.setQueryFile(options.inputFileName1, options.format1) + cosq.setReferenceFile(options.inputFileName2, options.format2) + cosq.setOutputFile(options.outputFileName) + 
cosq.includeNotOverlapping(options.notOverlapping) + cosq.setDistance(options.distance) + cosq.setCollinear(options.collinear) + cosq.setAntisense(options.antisense) + cosq.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef) + cosq.setMinOverlap(options.minOverlap) + cosq.setInclude(options.included, options.including) + cosq.setInvert(options.exclude) + cosq.run() diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/CompareOverlappingSmallRef.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CompareOverlappingSmallRef.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,250 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
+# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.writer.TranscriptWriter import TranscriptWriter +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.Mapping import Mapping +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress + +MINBIN = 3 +MAXBIN = 7 +REFERENCE = 0 +QUERY = 1 + +def getBin(start, end): + for i in range(MINBIN, MAXBIN + 1): + binLevel = 10 ** i + if int(start / binLevel) == int(end / binLevel): + return int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)) + return int((MAXBIN + 1) * 10 ** (MAXBIN + 1)) + +def getOverlappingBins(start, end): + array = [] + bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1)) + for i in range(MINBIN, MAXBIN + 1): + binLevel = 10 ** i + array.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel)))) + array.append((bigBin, bigBin)) + return array + + +class CompareOverlappingSmallRef(object): + + def __init__(self, verbosity): + self.verbosity = verbosity + self.tableNames = {} + self.nbQueries = 0 + self.nbRefs = 0 + self.nbWritten = 0 + self.nbOverlaps = 0 + self.invert = False + self.antisense = False + self.collinear = False + self.distance = None + self.minOverlap = False + self.pcOverlapQuery = False + self.pcOverlapRef = False + self.included = False + self.including = False + self.bins = {} + self.notOverlapping = False + + def setReferenceFile(self, fileName, format): + chooser = ParserChooser(self.verbosity) + chooser.findFormat(format) + self.refParser = chooser.getParser(fileName) + + def setQueryFile(self, fileName, format): + chooser = ParserChooser(self.verbosity) + 
chooser.findFormat(format) + self.queryParser = chooser.getParser(fileName) + + def setOutputFile(self, fileName): + self.writer = TranscriptWriter(fileName, "gff3", self.verbosity) + + def setDistance(self, distance): + self.distance = distance + + def setCollinear(self, boolean): + self.collinear = boolean + + def setAntisense(self, boolean): + self.antisense = boolean + + def setInvert(self, boolean): + self.invert = boolean + + def setMinPercentOverlap(self, pcOverlapQuery, pcOverlapRef): + self.pcOverlapQuery = pcOverlapQuery + self.pcOverlapRef = pcOverlapRef + + def setInclude(self, included, including): + self.included = included + self.including = including + + def includeNotOverlapping(self, boolean): + self.notOverlapping = boolean + + def loadRef(self): + progress = UnlimitedProgress(10000, "Reading references", self.verbosity) + for transcript in self.refParser.getIterator(): + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + transcript = self._alterTranscript(transcript, REFERENCE) + chromosome = transcript.getChromosome() + bin = getBin(transcript.getStart(), transcript.getEnd()) + if chromosome not in self.bins: + self.bins[chromosome] = {} + if bin not in self.bins[chromosome]: + self.bins[chromosome][bin] = [] + self.bins[chromosome][bin].append(transcript) + self.nbRefs += 1 + progress.inc() + progress.done() + + def _alterTranscript(self, transcript, type): + if type == REFERENCE: + if self.distance != None: + transcript.extendExons(self.distance) + return transcript + + def _compareTwoTranscripts(self, queryTranscript, refTranscript): + if not queryTranscript.overlapWithExon(refTranscript): + return False + if self.collinear and queryTranscript.getDirection() != refTranscript.getDirection(): + return False + if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection(): + return False + if self.included and not queryTranscript.isIncluded(refTranscript): + return False + if 
self.including and not refTranscript.isIncluded(queryTranscript): + return False + querySize = queryTranscript.getSize() + if self.pcOverlapQuery and not queryTranscript.overlapWithExon(refTranscript, int(querySize * self.pcOverlapQuery / 100.0)): + return False + refSize = refTranscript.getSize() + if self.pcOverlapRef and not queryTranscript.overlapWithExon(refTranscript, int(refSize * self.pcOverlapRef / 100.0)): + return False + if self.minOverlap and not queryTranscript.overlapWithExon(refTranscript, self.minOverlap): + return False + return True + + def _compareTranscript(self, queryTranscript): + queryChromosome = queryTranscript.getChromosome() + if queryChromosome not in self.bins: + return [] + queryStart = queryTranscript.getStart() + queryEnd = queryTranscript.getEnd() + bins = getOverlappingBins(queryStart, queryEnd) + overlaps = {} + for binRange in bins: + for bin in range(binRange[0], binRange[1]+1): + if bin not in self.bins[queryChromosome]: + continue + for refTranscript in self.bins[queryChromosome][bin]: + if self._compareTwoTranscripts(queryTranscript, refTranscript): + nbElements = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1 + overlaps[refTranscript.getName()] = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1 + self.nbOverlaps += nbElements + return overlaps + + def _updateTranscript(self, queryTranscript, overlaps): + queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values())) + if overlaps: + queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100]) + return queryTranscript + + def compare(self): + progress = UnlimitedProgress(10000, "Comparing queries", self.verbosity) + for queryTranscript in self.queryParser.getIterator(): + if queryTranscript.__class__.__name__ == "Mapping": + queryTranscript = queryTranscript.getTranscript() + progress.inc() + self.nbQueries += 1 + overlaps = 
self._compareTranscript(queryTranscript) + if self.notOverlapping or (overlaps and not self.invert) or (not overlaps and self.invert): + if not self.invert: + queryTranscript = self._updateTranscript(queryTranscript, overlaps) + self.writer.addTranscript(queryTranscript) + self.nbWritten += 1 + progress.done() + self.writer.close() + + def displayResults(self): + if self.verbosity > 0: + print "# queries: %d" % (self.nbQueries) + print "# refs: %d" % (self.nbRefs) + print "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps) + + def run(self): + self.loadRef() + self.compare() + self.displayResults() + +if __name__ == "__main__": + + description = "Compare Overlapping Small Reference v1.0.1: Provide the queries that overlap with a reference, when the reference is small. [Category: Data Comparison]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") + parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]") + parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept 
some distance between query and reference [format: int]") + parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]") + parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]") + parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=False, type="int", help="min. #nt overlap [format: bool] [default: false]") + parser.add_option("-p", "--pcOverlapQuery", dest="pcOverlapQuery", action="store", default=False, type="int", help="min. % overlap of the query [format: bool] [default: false]") + parser.add_option("-P", "--pcOverlapRef", dest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]") + parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="provide query elements which are nested in reference elements [format: bool] [default: false]") + parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="provide query elements in which reference elements are nested [format: bool] [default: false]") + parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + cosr = CompareOverlappingSmallRef(options.verbosity) + cosr.setQueryFile(options.inputFileName1, options.format1) + cosr.setReferenceFile(options.inputFileName2, options.format2) + cosr.setOutputFile(options.outputFileName) + cosr.includeNotOverlapping(options.notOverlapping) + cosr.setDistance(options.distance) + cosr.setAntisense(options.antisense) + 
cosr.setInclude(options.included, options.including) + cosr.setInvert(options.exclude) + cosr.setMinOverlap(options.minOverlap) + cosr.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef) + cosr.run() + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ComputeCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ComputeCoverage.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,142 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+import os, random
+from optparse import OptionParser, OptionGroup
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from SMART.Java.Python.misc.Progress import Progress
+from commons.core.writer.Gff3Writer import Gff3Writer
+
+
+class CoverageComputer(object):
+    """Compute the coverage of a query set with respect to a reference set.
+
+    Every position spanned by a reference exon is recorded per chromosome.
+    Query exons are then scanned position by position to count how many of
+    their nucleotides fall on a recorded reference position.
+    """
+
+    def __init__(self, verbosity = 0):
+        """Create a computer with no input and no output configured.
+
+        verbosity -- trace level; readers, writer and progress bars are
+                     given this value minus one
+        """
+        self.verbosity = verbosity
+        self.queryReader = None
+        self.referenceReader = None
+        self.outputWriter = None
+        self.introns = False
+        self.nbNucleotides = 0
+        self.nbCovered = 0
+        # chromosome -> {position: 1}; rebuilt by readReference()
+        self.coveredRegions = {}
+
+    def setInputQueryFile(self, fileName, format):
+        """Open the query file `fileName`, written in `format`."""
+        self.queryReader = TranscriptContainer(fileName, format, self.verbosity-1)
+
+    def setInputReferenceFile(self, fileName, format):
+        """Open the reference file `fileName`, written in `format`."""
+        self.referenceReader = TranscriptContainer(fileName, format, self.verbosity-1)
+
+    def includeIntrons(self, boolean):
+        """Choose whether removeExons() is applied to each transcript first.
+
+        NOTE(review): presumably removeExons() collapses a transcript into a
+        single span so that introns are counted too -- confirm against
+        Transcript.
+        """
+        self.introns = boolean
+
+    def setOutputFileName(self, fileName, title="S-MART", feature="transcript", featurePart="exon"):
+        """Create the GFF3 writer used by write().
+
+        fileName    -- path of the GFF3 file to produce
+        title       -- title given to the writer
+        feature     -- feature name given to the writer
+        featurePart -- feature-part name given to the writer
+        """
+        self.outputWriter = Gff3Writer(fileName, self.verbosity-1)
+        self.outputWriter.setTitle(title)
+        self.outputWriter.setFeature(feature)
+        self.outputWriter.setFeaturePart(featurePart)
+
+    def readReference(self):
+        """Record every position covered by a reference exon."""
+        self.coveredRegions = {}
+        progress = Progress(self.referenceReader.getNbTranscripts(), "Reading reference file", self.verbosity-1)
+        for transcript in self.referenceReader.getIterator():
+            chromosome = transcript.getChromosome()
+            if chromosome not in self.coveredRegions:
+                self.coveredRegions[chromosome] = {}
+            if self.introns:
+                transcript.removeExons()
+            for exon in transcript.getExons():
+                # exon coordinates are treated as inclusive on both ends
+                for position in range(exon.getStart(), exon.getEnd()+1):
+                    self.coveredRegions[chromosome][position] = 1
+            progress.inc()
+        progress.done()
+
+    def readQuery(self):
+        """Accumulate the global counters nbNucleotides and nbCovered.
+
+        NOTE(review): query transcripts located on a chromosome that the
+        reference never mentions are skipped altogether, so their nucleotides
+        are not part of the total printed by sumUp().
+        """
+        progress = Progress(self.queryReader.getNbTranscripts(), "Reading query file", self.verbosity-1)
+        for transcript in self.queryReader.getIterator():
+            progress.inc()
+            chromosome = transcript.getChromosome()
+            if chromosome not in self.coveredRegions:
+                continue
+            if self.introns:
+                transcript.removeExons()
+            for exon in transcript.getExons():
+                for position in range(exon.getStart(), exon.getEnd()+1):
+                    self.nbNucleotides += 1
+                    self.nbCovered += self.coveredRegions[chromosome].get(position, 0)
+        progress.done()
+
+    def write(self):
+        """Write every query transcript with a "coverage" tag (percentage)."""
+        progress = Progress(self.queryReader.getNbTranscripts(), "Writing output file", self.verbosity-1)
+        for transcript in self.queryReader.getIterator():
+            chromosome = transcript.getChromosome()
+            # A query chromosome that is absent from the reference has no
+            # covered position: report 0% instead of raising KeyError.
+            coveredPositions = self.coveredRegions.get(chromosome, {})
+            if self.introns:
+                transcript.removeExons()
+            size = transcript.getSize()
+            coverage = 0
+            for exon in transcript.getExons():
+                for position in range(exon.getStart(), exon.getEnd()+1):
+                    coverage += coveredPositions.get(position, 0)
+            transcript.setTagValue("coverage", 0 if size == 0 else float(coverage) / size * 100)
+            self.outputWriter.addTranscript(transcript)
+            progress.inc()
+        progress.done()
+
+    def sumUp(self):
+        """Print the global number and percentage of covered nucleotides."""
+        # Single parenthesised argument: same output under Python 2 and 3.
+        print("%d nucleotides in query, %d (%.f%%) covered" % (self.nbNucleotides, self.nbCovered, 0 if self.nbNucleotides == 0 else float(self.nbCovered) / self.nbNucleotides * 100))
+
+    def run(self):
+        """Read both inputs, write the output if one was set, then sum up."""
+        self.readReference()
+        self.readQuery()
+        if self.outputWriter is not None:
+            self.write()
+        self.sumUp()
+
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Compute Coverage v1.0.1: Compute the coverage of a set with respect to another set. [Category: Personal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input query file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of the first file [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input reference file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of the second file [compulsory] [format: transcript file format]")
+    parser.add_option("-t", "--introns", dest="introns", action="store_true", default=False, help="also include introns [format: boolean] [default: false]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
+    (options, args) = parser.parse_args()
+
+    computer = CoverageComputer(options.verbosity)
+    computer.setInputQueryFile(options.inputFileName1, options.format1)
+    computer.setInputReferenceFile(options.inputFileName2, options.format2)
+    computer.includeIntrons(options.introns)
+    # -o is optional: only create a writer when a file name was given, so
+    # that run() can skip write() as its None check intends.
+    if options.outputFileName is not None:
+        computer.setOutputFileName(options.outputFileName)
+    computer.run()
+
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/CountLoci.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/CountLoci.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,230 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2012
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+import os, os.path, random
+from optparse import OptionParser
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from commons.core.parsing.GffParser import GffParser
+from commons.core.writer.Gff3Writer import Gff3Writer
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.RPlotter import RPlotter
+from SMART.Java.Python.cleanGff import CleanGff
+from SMART.Java.Python.CompareOverlappingSmallRef import CompareOverlappingSmallRef
+from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator
+from SMART.Java.Python.GetUpDownStream import GetUpDownStream
+
+REFERENCE = 0
+QUERY = 1
+
+class CountLoci(object):
+    """Tag input elements with the kind of locus they overlap.
+
+    run() tests the input successively against CDS, 5' UTR, 3' UTR, ncRNA,
+    transposable elements, mRNA (reported as "intron") and the vicinity of
+    mRNAs.  At each step the overlapping elements are written with a "locus"
+    tag and only the remaining ones are passed to the next step; what is left
+    at the end is tagged "intergenic".
+    """
+
+    def __init__(self, verbosity = 1):
+        """Create a counter; `verbosity` is the trace level."""
+        self.verbosity = verbosity
+        # every intermediate file created along the way, removed by __del__
+        self.tmpFileNames = []
+
+    def __del__(self):
+        """Remove the intermediate files that still exist."""
+        for fileName in self.tmpFileNames:
+            if os.path.exists(fileName):
+                os.remove(fileName)
+
+    def setInputFile(self, fileName, format):
+        """Set the input file and its format, and report its size."""
+        self.inputFileName = fileName
+        self.inputFormat = format
+        self.parser = TranscriptContainer(fileName, format, self.verbosity-1)
+        if self.verbosity > 0:
+            print("%d elements in input" % (self.parser.getNbTranscripts()))
+
+    def setReference(self, fileName):
+        """Set the reference annotation file (read by CleanGff)."""
+        self.referenceFileName = fileName
+
+    def setDistance(self, distance):
+        """Set the up/down-stream distance that defines the vicinity."""
+        self.distance = distance
+
+    def setOutputFileName(self, fileName):
+        """Create the GFF3 writer and the prefix of the intermediate files."""
+        self.outputFileName = fileName
+        self.writer = Gff3Writer(fileName, self.verbosity-1)
+        # NOTE(review): the random suffix only makes a name clash unlikely;
+        # the tempfile module would be a safer choice.
+        self.outputBase = "%s_%d_" % (os.path.splitext(fileName)[0], random.randint(0, 10000))
+
+    def _writeTmpRef(self, tags, outputFileName):
+        """Extract the reference features whose type is in `tags`."""
+        cleanGff = CleanGff(self.verbosity-1)
+        cleanGff.setInputFileName(self.referenceFileName)
+        cleanGff.setOutputFileName(outputFileName)
+        cleanGff.setAcceptedTypes(tags)
+        cleanGff.run()
+
+    def _getReferenceFiles(self):
+        """Split the reference into one intermediate file per locus type."""
+        self.referenceFiles = {"CDS": "%scds.gff3" % (self.outputBase),
+                               "five_prime_UTR": "%sfive.gff3" % (self.outputBase),
+                               "three_prime_UTR": "%sthree.gff3" % (self.outputBase),
+                               "mRNA": "%smrna.gff3" % (self.outputBase),
+                               "ncRNA": "%sncRNA.gff3" % (self.outputBase),
+                               "transposable_element_gene": "%sTE.gff3" % (self.outputBase),
+                               "vic": "%svicinity.gff3" % (self.outputBase)}
+        self.tmpFileNames.extend(self.referenceFiles.values())
+        for tag, fileName in self.referenceFiles.iteritems():
+            if tag == "ncRNA":
+                # all the small RNA types are pooled in a single file
+                self._writeTmpRef(["miRNA", "ncRNA", "rRNA", "snoRNA", "snRNA", "tRNA"], fileName)
+            elif tag == "vic":
+                # the vicinity file is produced later, by _getVicinity()
+                continue
+            else:
+                self._writeTmpRef([tag], fileName)
+
+    def _compare(self, queryFileName, queryFormat, referenceFileName, referenceFormat, outputFileName, exclusion = False):
+        """Run CompareOverlappingSmallRef and return its nbWritten counter.
+
+        exclusion -- when true, the match is inverted (setInvert(True))
+        """
+        co = CompareOverlappingSmallRef(self.verbosity-1)
+        co.setQueryFile(queryFileName, queryFormat)
+        co.setReferenceFile(referenceFileName, referenceFormat)
+        co.setOutputFile(outputFileName)
+        if exclusion:
+            co.setInvert(True)
+        co.run()
+        return co.nbWritten
+
+    def _copy(self, inputFile, tag):
+        """Copy `inputFile` to the output, setting the "locus" tag to `tag`."""
+        parser = GffParser(inputFile, self.verbosity-1)
+        for transcript in parser.getIterator():
+            transcript.setTagValue("locus", tag)
+            self.writer.addTranscript(transcript)
+
+    def _splitOnReference(self, inputFileName, inputFormat, referenceKey, inName, outName, locus, label):
+        """Split the input in two around one reference file.
+
+        inputFileName -- file to classify
+        inputFormat   -- format of that file
+        referenceKey  -- key of self.referenceFiles to compare with
+        inName        -- infix of the file receiving the overlapping elements
+        outName       -- infix of the file receiving the other elements
+        locus         -- value of the "locus" tag for overlapping elements
+        label         -- name of the locus in the trace message
+
+        Returns the name of the file holding the non-overlapping elements.
+        """
+        outputFileName = "%s%s.gff3" % (self.outputBase, inName)
+        outputNoFileName = "%s%s.gff3" % (self.outputBase, outName)
+        self.tmpFileNames.extend([outputFileName, outputNoFileName])
+        referenceFileName = self.referenceFiles[referenceKey]
+        nbOverlaps = self._compare(inputFileName, inputFormat, referenceFileName, "gff3", outputFileName)
+        self._compare(inputFileName, inputFormat, referenceFileName, "gff3", outputNoFileName, True)
+        self._copy(outputFileName, locus)
+        if self.verbosity > 0:
+            print("%d overlaps in %s" % (nbOverlaps, label))
+        return outputNoFileName
+
+    def _getCds(self):
+        """Tag the input elements overlapping a CDS; return the others."""
+        return self._splitOnReference(self.inputFileName, self.inputFormat, "CDS", "in_cds", "in_nocds", "CDS", "CDS")
+
+    def _getFivePrime(self, inputFileName):
+        """Tag the elements overlapping a 5' UTR; return the others."""
+        return self._splitOnReference(inputFileName, "gff3", "five_prime_UTR", "in_five", "in_nofive", "five_prime_UTR", "5' UTR")
+
+    def _getThreePrime(self, inputFileName):
+        """Tag the elements overlapping a 3' UTR; return the others."""
+        return self._splitOnReference(inputFileName, "gff3", "three_prime_UTR", "in_three", "in_nothree", "three_prime_UTR", "3' UTR")
+
+    def _getNcRna(self, inputFileName):
+        """Tag the elements overlapping a non-coding RNA; return the others."""
+        return self._splitOnReference(inputFileName, "gff3", "ncRNA", "in_ncRna", "in_noNcRna", "ncRNA", "ncRNA")
+
+    def _getTe(self, inputFileName):
+        """Tag the elements overlapping a TE gene; return the others."""
+        return self._splitOnReference(inputFileName, "gff3", "transposable_element_gene", "in_te", "in_noTe", "TE", "TE")
+
+    def _getIntron(self, inputFileName):
+        """Tag the elements overlapping an mRNA as "intron"; return the others.
+
+        Exonic overlaps were removed by the previous steps of run().
+        """
+        return self._splitOnReference(inputFileName, "gff3", "mRNA", "in_intron", "in_nointron", "intron", "introns")
+
+    def _getVicinity(self, inputFileName):
+        """Tag the remaining elements as "vicinity" or "intergenic"."""
+        guds = GetUpDownStream(self.verbosity-1)
+        guds.setInputFile(self.referenceFiles["mRNA"], "gff3")
+        guds.setOutputFile(self.referenceFiles["vic"])
+        guds.setDistances(self.distance, self.distance)
+        guds.run()
+        outputFileName = "%sout_vicinity.gff3" % (self.outputBase)
+        outputNoFileName = "%sout_novicinity.gff3" % (self.outputBase)
+        self.tmpFileNames.extend([outputFileName, outputNoFileName])
+        nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["vic"], "gff3", outputFileName)
+        nbNoOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["vic"], "gff3", outputNoFileName, True)
+        self._copy(outputFileName, "vicinity")
+        self._copy(outputNoFileName, "intergenic")
+        if self.verbosity > 0:
+            print("%d overlaps in vicinity" % (nbOverlaps))
+            print("%d elsewhere" % (nbNoOverlaps))
+
+    def run(self):
+        """Classify the input; each step only sees what the previous one left.
+
+        NOTE(review): the writer is never closed explicitly here; presumably
+        Gff3Writer flushes on destruction -- confirm.
+        """
+        self._getReferenceFiles()
+        outputFileName = self._getCds()
+        outputFileName = self._getFivePrime(outputFileName)
+        outputFileName = self._getThreePrime(outputFileName)
+        outputFileName = self._getNcRna(outputFileName)
+        outputFileName = self._getTe(outputFileName)
+        outputFileName = self._getIntron(outputFileName)
+        self._getVicinity(outputFileName)
+
+
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Count Loci v1.0.0: Count input elements with respect to CDS, 5' UTR, 3' UTR, intron, downstream, upstream. [Category: Personal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]")
+    parser.add_option("-r", "--reference", dest="reference", action="store", type="string", help="reference file [compulsory] [format: file in GFF format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
+    parser.add_option("-d", "--distance", dest="distance", action="store", type="int", help="distance up/down stream [compulsory] [format: int]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    cl = CountLoci(options.verbosity)
+    cl.setInputFile(options.inputFileName, options.format)
+    cl.setDistance(options.distance)
+    cl.setReference(options.reference)
+    cl.setOutputFileName(options.outputFileName)
+    cl.run()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/CountReadGCPercent.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/CountReadGCPercent.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import FastaParser
+from commons.core.writer.Gff3Writer import Gff3Writer
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from SMART.Java.Python.misc.Progress import Progress
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from Gnome_tools.CountGCPercentBySlidingWindow import CountGCPercentBySlidingWindow
+
+
+class CountReadGCPercent(object):
+    """Annotate the elements of a GFF3 file with their GC and N percentages.
+
+    The sequence of each exon is fetched from a FASTA file and handed to
+    CountGCPercentBySlidingWindow; the resulting values are stored in the
+    "GCpercent" and "NPercent" tags of the transcript.
+    """
+
+    def __init__(self, verbosity = 0):
+        """Create the counter.
+
+        verbosity -- trace level given to the reader, writer and progress
+                     bars (defaults to 0, the former hard-wired value)
+        """
+        self.referenceReader = None
+        self.gffReader = None
+        self.outputWriter = None
+        self.verbose = verbosity
+
+    def setInputReferenceFile(self, fileName):
+        """Set the name of the FASTA file (it is only opened by write())."""
+        self.referenceReader = fileName
+
+    def setInputGffFile(self, fileName):
+        """Open the GFF3 annotation file."""
+        self.gffReader = TranscriptContainer(fileName, 'gff3', self.verbose)
+
+    def setOutputFileName(self, fileName):
+        """Create the GFF3 writer of the output file."""
+        self.outputWriter = Gff3Writer(fileName, self.verbose)
+
+    def readGffAnnotation(self):
+        """Record every position covered by an exon of the annotation.
+
+        NOTE(review): the resulting self.coveredRegions is not read by any
+        other method of this class; this pass may be removable.
+        """
+        self.coveredRegions = {}
+        progress = Progress(self.gffReader.getNbTranscripts(), "Reading gff3 annotation file", self.verbose)
+        for transcript in self.gffReader.getIterator():
+            chromosome = transcript.getChromosome()
+            if chromosome not in self.coveredRegions:
+                self.coveredRegions[chromosome] = {}
+            for exon in transcript.getExons():
+                for position in range(exon.getStart(), exon.getEnd()+1):
+                    self.coveredRegions[chromosome][position] = 1
+            progress.inc()
+        progress.done()
+
+    def write(self):
+        """Compute the percentages and write the tagged transcripts.
+
+        NOTE(review): the values are overwritten for each exon, so a
+        multi-exon transcript carries those of its last exon only.
+        """
+        iParser = FastaParser(self.referenceReader)
+        iParser.setTags()
+        iGetGCPercentBySW = CountGCPercentBySlidingWindow()
+        progress = Progress(self.gffReader.getNbTranscripts(), "Writing output file", self.verbose)
+        for transcript in self.gffReader.getIterator():
+            chromosome = transcript.getChromosome()
+            GCpercent = 0
+            nPercent = 0
+            for exon in transcript.getExons():
+                # Only chromosomes present in the FASTA file can be fetched;
+                # a direct membership test replaces the former loop over all
+                # the sequence names.
+                if chromosome in iParser.getTags().keys():
+                    subSequence = iParser.getSubSequence(chromosome, exon.getStart() , exon.getEnd(), 1)
+                    GCpercent, nPercent = iGetGCPercentBySW.getGCPercentAccordingToNAndNPercent(subSequence)
+                    print("GCpercent = %f, nPercent = %f" % (GCpercent, nPercent))
+            transcript.setTagValue("GCpercent", GCpercent)
+            transcript.setTagValue("NPercent", nPercent)
+            self.outputWriter.addTranscript(transcript)
+            progress.inc()
+        progress.done()
+
+    def run(self):
+        """Read the annotation, then write the output if a writer was set."""
+        self.readGffAnnotation()
+        if self.outputWriter is not None:
+            self.write()
+
+if __name__ == "__main__":
+    description = "Count GC percent for each read against a genome."
+    usage = "CountReadGCPercent.py -i -j -o [-v -h]"
+    examples = "\nExample: \n"
+    examples += "\t$ python CountReadGCPercent.py -i file.fasta -j annotation.gff -o output.gff3"
+    examples += "\n\n"
+    parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
+    parser.add_option( '-i', '--inputGenome', dest='fastaFile', help='fasta file [compulsory]', default= None )
+    parser.add_option( '-j', '--inputAnnotation', dest='gffFile', help='gff3 file [compulsory]', default= None)
+    parser.add_option( '-o', '--output', dest='outputFile', help='output gff3 file [compulsory]', default= None )
+    parser.add_option( '-v', '--verbose', dest='verbose', help='verbosity level (default=0/1)',type="int", default= 0 )
+    (options, args) = parser.parse_args()
+
+    # hand the -v value over: it used to be parsed and then ignored
+    readGCPercent = CountReadGCPercent(options.verbose)
+    readGCPercent.setInputReferenceFile(options.fastaFile)
+    readGCPercent.setInputGffFile(options.gffFile)
+    readGCPercent.setOutputFileName(options.outputFile)
+    readGCPercent.run()
+    
\ No newline at end of file
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/FindOverlapsOptim.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/FindOverlapsOptim.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,343 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2012
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+
+import os, struct, time, shutil
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.Gff3Writer import Gff3Writer
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.Interval import Interval
+from SMART.Java.Python.ncList.NCList import NCList
+from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList
+from SMART.Java.Python.ncList.NCListParser import NCListParser
+from SMART.Java.Python.ncList.NCListCursor import NCListCursor
+from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
+from SMART.Java.Python.ncList.NCListHandler import NCListHandler
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+REFERENCE = 0
+QUERY = 1
+TYPES = (REFERENCE, QUERY)
+TYPETOSTRING = {0: "reference", 1: "query"}
+
+class FindOverlapsOptim(object):
+    """Find the query elements that overlap reference elements.
+
+    Both inputs are handled as NC-lists (converted first when they are given
+    in another format) and walked with cursors.  Each query element that
+    overlaps at least one reference element is written to a GFF3 file with
+    the tags "overlapsWith" and "nbOverlaps".
+    """
+
+    def __init__(self, verbosity = 1):
+        """Create a finder; `verbosity` is the trace level."""
+        self._parsers = {}
+        self._sortedFileNames = {}
+        self._outputFileName = "outputOverlaps.gff3"
+        self._iWriter = None
+        self._inputFileNames = {REFERENCE: None, QUERY: None}
+        # name of the .ncl file produced for each input, False when none
+        self._convertedFileNames = {REFERENCE: False, QUERY: False}
+        self._inputFileFormats = {REFERENCE: None, QUERY: None}
+        # True when the input is not an NC-list and has to be converted
+        self._converted = {REFERENCE: False, QUERY: False}
+        self._ncListHandlers = {REFERENCE: None, QUERY: None}
+        self._splittedFileNames = {REFERENCE: {}, QUERY: {}}
+        self._nbOverlappingQueries = 0
+        self._nbOverlaps = 0
+        self._nbLines = {REFERENCE: 0, QUERY: 0}
+        self._sorted = False
+        self._index = False
+        self._verbosity = verbosity
+        self._ncLists = {}
+        self._cursors = {}
+        self._nbElementsPerChromosome = {}
+        self._tmpDirectories = {REFERENCE: False, QUERY: False}
+
+    def close(self):
+        """Close the writer and remove the files created by this object."""
+        # the writer only exists once setOutputFileName() has been called
+        if self._iWriter is not None:
+            self._iWriter.close()
+        for fileName in (self._sortedFileNames.values()):
+            if os.path.exists(fileName):
+                os.remove(fileName)
+        for fileName in self._convertedFileNames.values():
+            # entries stay False for inputs that needed no conversion
+            if fileName and os.path.exists(fileName):
+                os.remove(fileName)
+
+    def setRefFileName(self, fileName, format):
+        """Set the reference file and its format."""
+        self.setFileName(fileName, format, REFERENCE)
+
+    def setQueryFileName(self, fileName, format):
+        """Set the query file and its format."""
+        self.setFileName(fileName, format, QUERY)
+
+    def setFileName(self, fileName, format, type):
+        """Register an input file; `type` is REFERENCE or QUERY.
+
+        Any format other than "nclist" (case-insensitive) marks the file as
+        having to be converted by createNCLists().
+        """
+        self._inputFileNames[type] = fileName
+        self._inputFileFormats[type] = format
+        if format.lower() != "nclist":
+            self._converted[type] = True
+
+    def setOutputFileName(self, outputFileName):
+        """Set the output file name and create its GFF3 writer."""
+        self._outputFileName = outputFileName
+        self._iWriter = Gff3Writer(self._outputFileName)
+
+    def setSorted(self, sorted):
+        """Declare whether the input files are already sorted."""
+        self._sorted = sorted
+
+    def setIndex(self, index):
+        """Choose whether an index is built on the reference."""
+        self._index = index
+
+    def createNCLists(self):
+        """Convert the inputs if needed, load them and create the cursors."""
+        startTime = time.time()
+        if self._verbosity > 1:
+            print("Building database")
+        self._ncLists = dict([inputType, {}] for inputType in TYPES)
+        self._indices = dict([inputType, {}] for inputType in TYPES)
+        self._cursors = dict([inputType, {}] for inputType in TYPES)
+        for inputType in TYPES:
+            handler = NCListHandler(self._verbosity-3)
+            self._ncListHandlers[inputType] = handler
+            if self._converted[inputType]:
+                self._convertedFileNames[inputType] = "%s_%d.ncl" % (os.path.splitext(self._inputFileNames[inputType])[0], inputType)
+                ncLists = ConvertToNCList(self._verbosity-3)
+                ncLists.setInputFileName(self._inputFileNames[inputType], self._inputFileFormats[inputType])
+                ncLists.setSorted(self._sorted)
+                ncLists.setOutputFileName(self._convertedFileNames[inputType])
+                if inputType == REFERENCE and self._index:
+                    ncLists.setIndex(True)
+                ncLists.run()
+                handler.setFileName(self._convertedFileNames[inputType])
+            else:
+                handler.setFileName(self._inputFileNames[inputType])
+            handler.loadData()
+            self._nbLines[inputType] = handler.getNbElements()
+            self._nbElementsPerChromosome[inputType] = handler.getNbElementsPerChromosome()
+            self._ncLists[inputType] = handler.getNCLists()
+            for chromosome, ncList in self._ncLists[inputType].iteritems():
+                self._cursors[inputType][chromosome] = NCListCursor(None, ncList, 0, self._verbosity)
+                if inputType == REFERENCE and self._index:
+                    self._indices[REFERENCE][chromosome] = ncList.getIndex()
+        endTime = time.time()
+        if self._verbosity > 1:
+            print("done (%.2gs)" % (endTime - startTime))
+
+    def compare(self):
+        """Walk the query NC-list of each chromosome against the reference."""
+        nbSkips, nbMoves = 0, 0
+        done = False
+        startTime = time.time()
+        progress = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity)
+        for chromosome in self._ncLists[QUERY]:
+            # NOTE(review): the result was never used; the call is kept in
+            # case getParser() has side effects -- confirm.
+            self._ncListHandlers[QUERY].getParser(chromosome)
+            queryCursor = self._cursors[QUERY][chromosome]
+            if chromosome not in self._ncLists[REFERENCE]:
+                # nothing to overlap with; still counts as a processed step
+                progress.inc()
+                continue
+            refCursor = self._cursors[REFERENCE][chromosome]
+            while True:
+                queryTranscript = queryCursor.getTranscript()
+                newRefLaddr = self.checkIndex(queryTranscript, refCursor)
+                if newRefLaddr is not None:
+                    # the index allows jumping forward in the reference
+                    nbMoves += 1
+                    refCursor.setLIndex(newRefLaddr)
+                    done = False
+                refCursor, done, unmatched = self.findOverlapIter(queryTranscript, refCursor, done)
+                if refCursor.isOut():
+                    break
+                if unmatched or not queryCursor.hasChildren():
+                    # children of a query that overlaps nothing are skipped
+                    queryCursor.moveNext()
+                    nbSkips += 1
+                else:
+                    queryCursor.moveDown()
+                if queryCursor.isOut():
+                    break
+            progress.inc()
+        progress.done()
+        endTime = time.time()
+        self._timeSpent = endTime - startTime
+        if self._verbosity >= 10:
+            print("# skips: %d" % (nbSkips))
+            print("# moves: %d" % (nbMoves))
+
+    def findOverlapIter(self, queryTranscript, cursor, done):
+        """Collect and write the reference elements overlapping a query.
+
+        queryTranscript -- the query element
+        cursor          -- reference cursor where the search starts; it is
+                           moved by this method
+        done            -- when true, the first overlapping element found is
+                           stepped over with moveNext() instead of being
+                           descended into
+
+        Returns a tuple (cursor to restart from for the next query, value of
+        `done` for the next call, True if nothing overlaps the query).
+        """
+        # setLIndex(-1) is used as the "not set yet" state of this cursor
+        # (it is tested with isOut() below).
+        firstOverlapLAddr = NCListCursor(cursor)
+        firstOverlapLAddr.setLIndex(-1)
+        chromosome = queryTranscript.getChromosome()
+        # Both early exits used to return 2-tuples, which the caller cannot
+        # unpack; they now return the same triple as the other exits.
+        if chromosome not in self._ncLists[REFERENCE] or cursor.isOut():
+            return firstOverlapLAddr, False, True
+        overlappingNames = {}
+        nextDone = False
+        parentCursor = NCListCursor(cursor)
+        parentCursor.moveUp()
+        firstParentAfter = False
+        # First look at the ancestors of the current reference element.
+        while not parentCursor.isOut():
+            parentOverlap = self.isOverlapping(queryTranscript, parentCursor)
+            if parentOverlap == 0:
+                overlappingNames.update(self._extractID(parentCursor.getTranscript()))
+                if firstOverlapLAddr.isOut():
+                    firstOverlapLAddr.copy(parentCursor)
+                    nextDone = True
+            elif parentOverlap == 1:
+                firstParentAfter = NCListCursor(parentCursor)
+            parentCursor.moveUp()
+        if firstParentAfter:
+            # an ancestor already lies after the query: stop here
+            self._writeIntervalInNewGFF3(queryTranscript, overlappingNames)
+            return firstParentAfter, False, not overlappingNames
+        # This loop finds the overlaps starting from the current element.
+        while True:
+            parentCursor = NCListCursor(cursor)
+            parentCursor.moveUp()
+            overlap = self.isOverlapping(queryTranscript, cursor)
+            if overlap == -1:
+                # the query is on the right of the reference element
+                cursor.moveNext()
+            elif overlap == 0:
+                # the query overlaps the reference element
+                overlappingNames.update(self._extractID(cursor.getTranscript()))
+                if firstOverlapLAddr.compare(parentCursor):
+                    firstOverlapLAddr.copy(cursor)
+                    nextDone = True
+                if done:
+                    cursor.moveNext()
+                else:
+                    if not cursor.hasChildren():
+                        cursor.moveNext()
+                        if cursor.isOut():
+                            break
+                    else:
+                        cursor.moveDown()
+            else:
+                # the query is on the left of the reference element
+                if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor):
+                    firstOverlapLAddr.copy(cursor)
+                    nextDone = False
+                break
+
+            done = False
+            if cursor.isOut():
+                break
+        self._writeIntervalInNewGFF3(queryTranscript, overlappingNames)
+        return firstOverlapLAddr, nextDone, not overlappingNames
+
+    def isOverlapping(self, queryTranscript, refTranscript):
+        """Compare the positions of two elements.
+
+        Returns 0 if they overlap, 1 if the query ends before the reference
+        starts, -1 otherwise (the query starts after the reference ends).
+        """
+        if (queryTranscript.getStart() <= refTranscript.getEnd() and queryTranscript.getEnd() >= refTranscript.getStart()):
+            return 0
+        if queryTranscript.getEnd() < refTranscript.getStart():
+            return 1
+        return -1
+
+    def checkIndex(self, transcript, cursor):
+        """Return the index position to jump to, or None.
+
+        A position is only returned when indexing is on and the indexed
+        address lies after the current address of `cursor`.
+        """
+        if not self._index:
+            return None
+        chromosome = transcript.getChromosome()
+        nextLIndex = self._indices[REFERENCE][chromosome].getIndex(transcript)
+        if nextLIndex is None:
+            return None
+        ncList = self._ncLists[REFERENCE][chromosome]
+        nextGffAddress = ncList.getRefGffAddr(nextLIndex)
+        thisGffAddress = cursor.getGffAddress()
+        if nextGffAddress > thisGffAddress:
+            return nextLIndex
+        return None
+
+    def _writeIntervalInNewGFF3(self, transcript, names):
+        """Write `transcript` if `names` ({reference id: count}) is not empty."""
+        if not names:
+            return
+        nbOverlaps = sum(names.values())
+        transcript.setTagValue("overlapsWith", "--".join(sorted(names.keys())))
+        transcript.setTagValue("nbOverlaps", nbOverlaps)
+        self._iWriter.addTranscript(transcript)
+        self._iWriter.write()
+        self._nbOverlappingQueries += 1
+        self._nbOverlaps += nbOverlaps
+
+    def _extractID(self, transcript):
+        """Return {id: number of elements} for a reference transcript.
+
+        The id is the "ID" tag when present, the unique name otherwise; the
+        count is the "nbElements" tag (as a float) when present, 1 otherwise.
+        """
+        nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
+        id = transcript.getTagValue("ID") if "ID" in transcript.getTagNames() else transcript.getUniqueName()
+        return {id: nbElements}
+
+    def run(self):
+        """Build the NC-lists, compare them, clean up and print a summary."""
+        self.createNCLists()
+        self.compare()
+        self.close()
+        if self._verbosity > 0:
+            print("# queries: %d" % (self._nbLines[QUERY]))
+            print("# refs: %d" % (self._nbLines[REFERENCE]))
+            print("# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps))
+            print("time: %.2gs" % (self._timeSpent))
+
+
+if __name__ == "__main__":
+    description = "Find Overlaps Optim v1.0.0: Finds overlaps with several query intervals. [Category: Data Comparison]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--query", dest="inputQueryFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript or other format given by -f]")
+    parser.add_option("-f", "--queryFormat", dest="queryFormat", action="store", type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
+    parser.add_option("-j", "--ref", dest="inputRefFileName", action="store", type="string", help="reference input file [compulsory] [format: file in transcript or other format given by -g]")
+    parser.add_option("-g", "--refFormat", dest="refFormat", action="store", type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
+    parser.add_option("-d", "--index", dest="index", action="store_true", default=False, help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")
+    parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input files are already sorted [format: boolean] [default: False]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
+    (options, args) = parser.parse_args()
+
+    iFOO = FindOverlapsOptim(options.verbosity)
+    iFOO.setRefFileName(options.inputRefFileName, options.refFormat)
+    iFOO.setQueryFileName(options.inputQueryFileName, options.queryFormat)
+    iFOO.setOutputFileName(options.outputFileName)
+    iFOO.setIndex(options.index)
+    iFOO.setSorted(options.sorted)
+    iFOO.run()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/GetDifferentialExpression.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/GetDifferentialExpression.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,441 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. 
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Get the differential expression between 2 conditions (2 files), on regions defined by a third file"""

import os
import re
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc import Utils
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from SMART.Java.Python.structure.Transcript import Transcript


class GetDifferentialExpression(object):
    """Compare the read counts of two samples over a set of reference regions.

    The two samples are stored in a database, counted region by region,
    optionally normalized, tested through ``Utils.fisherExactPValueBulk`` and
    written to a GFF3 file with a ``regulation`` tag (up / down / equal).
    """

    def __init__(self, verbosity = 1):
        self.verbosity = verbosity
        self.mySqlConnection = MySqlConnection(verbosity)
        self.inputs = (0, 1)
        self.transcriptContainers = [None, None]
        self.transcriptContainerRef = None
        self.outputFileName = None
        self.writer = None
        self.tables = [None, None]
        self.nbElements = [0, 0]

        # region string -> (normalized count 1, normalized count 2, raw count 1, raw count 2)
        self.regionsToValues = {}
        # region string -> name written in the output
        self.regionsToNames = {}
        # filled by Utils.fisherExactPValueBulk; looked up with the 2 normalized counts
        self.valuesToPvalues = {}

        self.oriented = True
        self.simpleNormalization = False
        self.simpleNormalizationParameters = None
        self.adjustedNormalization = False
        self.fixedSizeFactor = None
        self.normalizationSize = None
        self.normalizationFactors = [1, 1]
        self.fdr = None
        self.fdrPvalue = None

        self.plot = False
        self.plotter = None
        self.plotterName = None
        self.points = {}

    def setInputFile(self, i, fileName, fileFormat):
        """Declare sample *i* (0 or 1) and attach it to the shared database connection."""
        self.transcriptContainers[i] = TranscriptContainer(fileName, fileFormat, self.verbosity)
        self.transcriptContainers[i].mySqlConnection = self.mySqlConnection

    def setReferenceFile(self, fileName, fileFormat):
        """Declare the file holding the reference regions."""
        self.transcriptContainerRef = TranscriptContainer(fileName, fileFormat, self.verbosity)
        self.transcriptContainerRef.mySqlConnection = self.mySqlConnection

    def setOutputFile(self, fileName):
        """Set the output file name and open the GFF3 writer on it."""
        self.outputFileName = fileName
        self.writer = Gff3Writer(fileName, self.verbosity)

    def setOriented(self, boolean):
        """When True, only reads with the direction of the reference exon are counted."""
        self.oriented = boolean

    def setSimpleNormalization(self, boolean):
        """Enable normalization by the total number of elements of each sample."""
        self.simpleNormalization = boolean

    def setSimpleNormalizationParameters(self, parameters):
        """Use user-provided sample sizes ("n1,n2") for the simple normalization.

        Passing None leaves the current settings untouched.
        """
        if parameters is not None:
            self.simpleNormalization = True
            self.simpleNormalizationParameters = [0, 0]
            for i, splittedParameter in enumerate(parameters.split(",")):
                self.simpleNormalizationParameters[i] = int(splittedParameter)

    def setAdjustedNormalization(self, boolean):
        """Enable the normalization based on the regions of the two middle quartiles."""
        self.adjustedNormalization = boolean

    def setFixedSizeNormalization(self, value):
        """Set the magnification factor of the sliding-window normalization (None disables it)."""
        self.fixedSizeFactor = value

    def setFdr(self, fdr):
        """Set the false discovery rate (None disables the FDR filter)."""
        self.fdr = fdr

    def setPlot(self, boolean):
        """Enable or disable the cloud plot."""
        self.plot = boolean

    def setPlotterName(self, plotterName):
        """Set the file name given to the plotter."""
        self.plotterName = plotterName

    def setPlotter(self):
        """Create the plotter (points, log scale on both axes) and reset the points."""
        self.plot = True
        self.plotter = RPlotter(self.plotterName, self.verbosity)
        self.plotter.setPoints(True)
        self.plotter.setLog("xy")
        self.points = {}

    def readInput(self, i):
        """Store sample *i* in the database, index its tables and count its elements."""
        container = self.transcriptContainers[i]
        container.storeIntoDatabase()
        self.tables[i] = container.getTables()
        progress = Progress(len(self.tables[i].keys()), "Adding indices", self.verbosity)
        for chromosome in self.tables[i]:
            if self.oriented:
                self.tables[i][chromosome].createIndex("iStartEndDir_%s_%d" % (chromosome, i), ("start", "end", "direction"))
            else:
                self.tables[i][chromosome].createIndex("iStartEnd_%s_%d" % (chromosome, i), ("start", "end"))
            progress.inc()
        progress.done()

        progress = Progress(container.getNbTranscripts(), "Reading sample %d" % (i + 1), self.verbosity)
        for chromosome in self.tables[i]:
            for transcript in self.tables[i][chromosome].getIterator():
                # an element without "nbElements" tag counts for one
                self.nbElements[i] += 1 if "nbElements" not in transcript.getTagNames() else transcript.getTagValue("nbElements")
                progress.inc()
        progress.done()
        if self.verbosity > 0:
            print("%d elements in sample %d" % (self.nbElements[i], i + 1))

    def computeSimpleNormalizationFactors(self):
        """Scale both samples to their average size (or to the average of the provided sizes)."""
        nbElements = self.nbElements
        if self.simpleNormalizationParameters is not None:
            print("Using provided normalization parameters: %s" % (", ".join([str(parameter) for parameter in self.simpleNormalizationParameters])))
            nbElements = self.simpleNormalizationParameters
        avgNbElements = int(float(sum(nbElements)) / len(nbElements))
        for i in self.inputs:
            self.normalizationFactors[i] = float(avgNbElements) / nbElements[i]
            self.nbElements[i] *= self.normalizationFactors[i]
        if self.verbosity > 1:
            print("Normalizing to average # reads: %d" % (avgNbElements))
            if self.simpleNormalizationParameters is not None:
                print("# reads: %s" % (", ".join([str(nbElement) for nbElement in self.nbElements])))

    def __del__(self):
        self.mySqlConnection.deleteDatabase()

    def regionToString(self, transcript):
        """Serialize the coordinates of a transcript as 'chr:start-end(strand)'."""
        return "%s:%d-%d(%s)" % (transcript.getChromosome(), transcript.getStart(), transcript.getEnd(), "+" if transcript.getDirection() == 1 else "-")

    def stringToRegion(self, region):
        """Rebuild a Transcript from a string produced by regionToString.

        Raises an Exception when the string cannot be parsed.
        """
        m = re.search(r"^(\S+):(\d+)-(\d+)\((\S)\)$", region)
        if m is None:
            raise Exception("Internal format error: cannot parse region '%s'" % (region))
        transcript = Transcript()
        transcript.setChromosome(m.group(1))
        transcript.setStart(int(m.group(2)))
        transcript.setEnd(int(m.group(3)))
        transcript.setDirection(m.group(4))
        return transcript

    def computeMinimumSize(self):
        """Store in normalizationSize the smallest (end - start) of the reference regions."""
        self.normalizationSize = 1000000000
        progress = Progress(self.transcriptContainerRef.getNbTranscripts(), "Getting minimum reference size", self.verbosity)
        for transcriptRef in self.transcriptContainerRef.getIterator():
            self.normalizationSize = min(self.normalizationSize, transcriptRef.getEnd() - transcriptRef.getStart())
            progress.inc()
        progress.done()
        if self.verbosity > 1:
            print("Minimum reference size: %d" % (self.normalizationSize + 1))

    def useFixedSizeNormalization(self, start, end, starts):
        """Return the sliding-window normalized count of one exon.

        start, end: coordinates of the exon.
        starts: dict {read start: number of elements}; it is completed in place
        with zeros for every position visited by the window.
        """
        if not starts:
            return 0
        windowSize = self.normalizationSize
        for position in range(start - windowSize, end + 1 + windowSize):
            if position not in starts:
                starts[position] = 0
        # reads starting before the exon are moved to its first position
        for position, nb in list(starts.items()):
            if position < start:
                starts[start] += nb
                starts[position] = 0
        currentNb = 0
        total = 0
        for position in range(start - windowSize, end + 1):
            currentNb += starts[position + windowSize] - starts[position]
            total += currentNb
        # float division: with two integers the ratio was truncated (to 0 as
        # soon as the exon was longer than the magnification factor)
        return (float(total) / windowSize) * (float(self.fixedSizeFactor) / (end - start + 1))

    def retrieveCounts(self, transcriptRef, i):
        """Count the elements of sample *i* included in the exons of a reference.

        Returns the tuple (raw count, count after fixed-size normalization);
        both are equal when no fixed-size factor is set.
        """
        if transcriptRef.getChromosome() not in self.tables[i]:
            return (0, 0)
        cumulatedCount = 0
        cumulatedNormalizedCount = 0
        for exon in transcriptRef.getExons():
            count = 0
            starts = {}
            command = "SELECT start, tags FROM '%s' WHERE start >= %d AND end <= %d" % (self.tables[i][exon.getChromosome()].getName(), exon.getStart(), exon.getEnd())
            if self.oriented:
                command += " AND direction = %d" % (exon.getDirection())
            query = self.mySqlConnection.executeQuery(command)
            for line in query.getIterator():
                nb = 1
                for tag in line[1].split(";"):
                    # partition tolerates tags without '=' or with several '='
                    key, _, value = tag.partition("=")
                    if key == "nbElements":
                        nb = int(float(value))
                count += nb
                # accumulate: several reads may share the same start
                readStart = int(line[0])
                starts[readStart] = starts.get(readStart, 0) + nb
            if self.fixedSizeFactor is None:
                normalizedCount = count
            else:
                normalizedCount = self.useFixedSizeNormalization(exon.getStart(), exon.getEnd(), starts)
            cumulatedCount += count
            cumulatedNormalizedCount += normalizedCount
        return (cumulatedCount, cumulatedNormalizedCount)

    def getAllCounts(self):
        """Name every reference region and store the counts of those covered by at least one read."""
        progress = Progress(self.transcriptContainerRef.getNbTranscripts(), "Getting counts", self.verbosity)
        for cpt, transcriptRef in enumerate(self.transcriptContainerRef.getIterator()):
            region = self.regionToString(transcriptRef)
            if "ID" in transcriptRef.getTagNames():
                self.regionsToNames[region] = transcriptRef.getTagValue("ID")
            elif transcriptRef.getName() is not None:
                self.regionsToNames[region] = transcriptRef.getName()
            else:
                self.regionsToNames[region] = "region_%d" % (cpt)
            values = [None, None]
            normalizedValues = [None, None]
            for i in self.inputs:
                values[i], normalizedValues[i] = self.retrieveCounts(transcriptRef, i)
                normalizedValues[i] = int(self.normalizationFactors[i] * normalizedValues[i])
            if sum(values) != 0:
                self.regionsToValues[region] = (normalizedValues[0], normalizedValues[1], values[0], values[1])
            progress.inc()
        progress.done()

    def computeAdjustedNormalizationFactors(self):
        """Refine the normalization factors using the regions of the two middle quartiles.

        Regions are ranked by their average corrected count; the counts of the
        regions lying between the first and the third quartile are summed per
        sample and the factors are scaled so that both sums match.
        """
        nbElements = len(self.regionsToValues)
        if nbElements == 0:
            # nothing was counted: no quartile can be computed
            return
        avgValues = []
        progress = Progress(nbElements, "Normalization step 1", self.verbosity)
        for values in self.regionsToValues.values():
            correctedValues = [values[i] * self.normalizationFactors[i] for i in self.inputs]
            avgValues.append(float(sum(correctedValues)) / len(correctedValues))
            progress.inc()
        progress.done()

        sortedAvgValues = sorted(avgValues)
        minAvgValues = sortedAvgValues[nbElements // 4]
        maxAvgValues = sortedAvgValues[nbElements * 3 // 4]
        sums = [0, 0]
        progress = Progress(nbElements, "Normalization step 2", self.verbosity)
        for values in self.regionsToValues.values():
            correctedValues = [values[i] * self.normalizationFactors[i] for i in self.inputs]
            avgValue = float(sum(correctedValues)) / len(correctedValues)
            if minAvgValues <= avgValue <= maxAvgValues:
                for i in self.inputs:
                    sums[i] += values[i]
            progress.inc()
        progress.done()

        avgSums = float(sum(sums)) / len(sums)
        for i in self.inputs:
            previousNbElements = self.nbElements[i]
            self.normalizationFactors[i] *= float(avgSums) / sums[i]
            self.nbElements[i] *= self.normalizationFactors[i]
            if self.verbosity > 1:
                print("Normalizing sample %d: %s to %s" % ((i + 1), previousNbElements, int(self.nbElements[i])))

    def getMinimumReferenceSize(self):
        """Store in normalizationSize the size (end - start + 1) of the smallest reference region."""
        self.normalizationSize = 1000000000
        progress = Progress(self.transcriptContainerRef.getNbTranscripts(), "Reference element sizes", self.verbosity)
        for transcriptRef in self.transcriptContainerRef.getIterator():
            self.normalizationSize = min(self.normalizationSize, transcriptRef.getEnd() - transcriptRef.getStart() + 1)
            progress.inc()
        progress.done()
        if self.verbosity > 1:
            print("Minimum reference size: %d" % (self.normalizationSize))

    def computePvalues(self):
        """Apply the normalization factors to the stored counts and compute all the p-values."""
        # NOTE(review): the counts stored by getAllCounts were already multiplied
        # by the normalization factors; confirm that applying them again here is intended.
        normalizedValues = set()
        progress = Progress(len(self.regionsToValues.keys()), "Normalizing counts", self.verbosity)
        for region in self.regionsToValues:
            values = self.regionsToValues[region]
            normalizedValues0 = int(round(values[0] * self.normalizationFactors[0]))
            normalizedValues1 = int(round(values[1] * self.normalizationFactors[1]))
            self.regionsToValues[region] = (normalizedValues0, normalizedValues1, values[2], values[3])
            normalizedValues.add((normalizedValues0, normalizedValues1, self.nbElements[0] - normalizedValues0, self.nbElements[1] - normalizedValues1, values[2], values[3]))
            progress.inc()
        progress.done()

        if self.verbosity > 1:
            print("Computing p-values...")
        self.valuesToPvalues = Utils.fisherExactPValueBulk(list(normalizedValues))
        if self.verbosity > 1:
            print("... done")

    def setTagValues(self, transcript, values, pValue):
        """Replace the tags and exons of a transcript by the results of the test.

        values: (normalized count 1, normalized count 2, raw count 1, raw count 2).
        Returns the same transcript.
        """
        # copy the names: tags are deleted while looping
        for tag in list(transcript.getTagNames()):
            transcript.deleteTag(tag)
        transcript.removeExons()
        transcript.setTagValue("pValue", str(pValue))
        transcript.setTagValue("nbReadsCond1", str(values[0]))
        transcript.setTagValue("nbReadsCond2", str(values[1]))
        transcript.setTagValue("nbUnnormalizedReadsCond1", str(values[2]))
        transcript.setTagValue("nbUnnormalizedReadsCond2", str(values[3]))
        # when the FDR is used and no p-value passed it (fdrPvalue is None),
        # no region is significant
        notSignificant = self.fdr is not None and (self.fdrPvalue is None or pValue > self.fdrPvalue)
        if (values[0] == values[1]) or notSignificant:
            transcript.setTagValue("regulation", "equal")
        elif values[0] < values[1]:
            transcript.setTagValue("regulation", "up")
        else:
            transcript.setTagValue("regulation", "down")
        return transcript

    def computeFdr(self):
        """Find the largest p-value passing the Benjamini-Hochberg threshold.

        The result is stored in fdrPvalue, which stays None when no p-value
        passes the threshold.
        """
        pValues = []
        nbRegions = len(self.regionsToValues.keys())
        progress = Progress(nbRegions, "Computing FDR", self.verbosity)
        for values in self.regionsToValues.values():
            pValues.append(self.valuesToPvalues[values[0:2]])
            progress.inc()
        progress.done()

        for i, pValue in enumerate(sorted(pValues, reverse = True)):
            # 1-based rank of this p-value in increasing order: the largest
            # p-value has rank nbRegions, the smallest has rank 1
            rank = nbRegions - i
            if pValue <= self.fdr * rank / nbRegions:
                self.fdrPvalue = pValue
                if self.verbosity > 1:
                    print("FDR: %f, k: %i, m: %d" % (pValue, rank, nbRegions))
                return

    def writeDifferentialExpression(self):
        """Write every counted region to the GFF3 file and draw the plot if requested."""
        if self.plot:
            self.setPlotter()

        progress = Progress(len(self.regionsToValues.keys()), "Writing output", self.verbosity)
        for region, values in self.regionsToValues.items():
            transcript = self.stringToRegion(region)
            pValue = self.valuesToPvalues[values[0:2]]
            transcript.setName(self.regionsToNames[region])
            transcript = self.setTagValues(transcript, values, pValue)
            self.writer.addTranscript(transcript)

            if self.plot:
                self.points[region] = (values[0], values[1])
            progress.inc()
        progress.done()
        self.writer.write()
        self.writer.close()

        if self.plot:
            self.plotter.addLine(self.points)
            self.plotter.plot()

    def getDifferentialExpression(self):
        """Run the whole analysis: read, normalize, count, test and write."""
        for i in self.inputs:
            self.readInput(i)

        if self.simpleNormalization:
            self.computeSimpleNormalizationFactors()
        if self.fixedSizeFactor is not None:
            self.computeMinimumSize()

        self.getAllCounts()

        if self.adjustedNormalization:
            self.computeAdjustedNormalizationFactors()

        self.computePvalues()

        if self.fdr is not None:
            self.computeFdr()

        self.writeDifferentialExpression()


if __name__ == "__main__":

    # parse command line
    description = "Get Differential Expression v1.0.1: Get the differential expression between 2 conditions using Fisher's exact test, on regions defined by a third file. [Category: Data Comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
    parser.add_option("-k", "--reference", dest="referenceFileName", action="store", type="string", help="reference file [compulsory] [format: file in transcript format given by -l]")
    parser.add_option("-l", "--referenceFormat", dest="referenceFormat", action="store", type="string", help="format of reference file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in gff3 format]")
    parser.add_option("-n", "--notOriented", dest="notOriented", action="store_true", default=False, help="if the reads are not oriented [default: False] [format: bool]")
    parser.add_option("-s", "--simple", dest="simple", action="store_true", default=False, help="normalize using the number of reads in each condition [format: bool]")
    parser.add_option("-S", "--simpleParameters", dest="simpleParameters", action="store", default=None, type="string", help="provide the number of reads [format: bool]")
    parser.add_option("-a", "--adjusted", dest="adjusted", action="store_true", default=False, help="normalize using the number of reads of 'mean' regions [format: bool]")
    parser.add_option("-x", "--fixedSizeFactor", dest="fixedSizeFactor", action="store", default=None, type="int", help="give the magnification factor for the normalization using fixed size sliding windows in reference regions (leave empty for no such normalization) [format: int]")
    parser.add_option("-d", "--fdr", dest="fdr", action="store", default=None, type="float", help="use FDR [format: float]")
    parser.add_option("-p", "--plot", dest="plotName", action="store", default=None, type="string", help="plot cloud plot [format: output file in PNG format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    differentialExpression = GetDifferentialExpression(options.verbosity)
    differentialExpression.setInputFile(0, options.inputFileName1, options.format1)
    differentialExpression.setInputFile(1, options.inputFileName2, options.format2)
    differentialExpression.setReferenceFile(options.referenceFileName, options.referenceFormat)
    differentialExpression.setOutputFile(options.outputFileName)
    if options.plotName is not None:
        differentialExpression.setPlotterName(options.plotName)
        differentialExpression.setPlotter()
    differentialExpression.setOriented(not options.notOriented)
    differentialExpression.setSimpleNormalization(options.simple)
    differentialExpression.setSimpleNormalizationParameters(options.simpleParameters)
    differentialExpression.setAdjustedNormalization(options.adjusted)
    differentialExpression.setFixedSizeNormalization(options.fixedSizeFactor)
    differentialExpression.setFdr(options.fdr)
    differentialExpression.getDifferentialExpression()
    differentialExpression.mySqlConnection.deleteDatabase()


# Changeset metadata and start of the header of the next file of the patch,
# kept verbatim (commented out):
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/GetDistribution.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/GetDistribution.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,362 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2012
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.FastaParser import FastaParser
from SMART.Java.Python.structure.Transcript import Transcript
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.MultipleRPlotter import MultipleRPlotter
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.misc.Progress import Progress

# strands to consider, depending on whether one curve per strand is requested
TWOSTRANDS = {True: [1, -1], False: [0]}
# suffix used in legends and in the CSV file for each strand
STRANDTOSTR = {1: "(+)", -1: "(-)", 0: ""}


class GetDistribution(object):
    """Plot the distribution of the start positions of genomic elements.

    Elements are accumulated in bins (per chromosome, per tag name and per
    strand); the bins are plotted and can also be exported as CSV or GFF3.
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.parser = None
        self.sizes = None
        self.chromosomes = None
        self.maxSize = None
        self.twoStrands = False
        self.start = 1
        self.end = None
        self.names = ["nbElements"]
        self.average = False
        # same defaults as the command line, so that run() works even when
        # the corresponding setters are not called
        self.normalization = False
        self.nbBins = 1000
        self.sliceSize = None
        self.bins = {}
        self.nbValues = {}
        self.height = 300
        self.width = 600
        self.colors = None
        self.outputFileName = None
        self.gffFileName = None
        self.csvFileName = None
        self.yMin = None
        self.yMax = None
        self.chromosome = None
        self.merge = False
        self.nbTranscripts = None

    def setInputFile(self, fileName, format):
        """Select the parser matching *format* and open the input file with it."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.parser = chooser.getParser(fileName)

    def setReferenceFile(self, fileName):
        """Read the chromosome sizes from a FASTA file (nothing is done if fileName is None)."""
        if fileName is None:
            return
        fastaParser = FastaParser(fileName, self.verbosity)
        self.chromosomes = fastaParser.getRegions()
        self.sizes = dict([region, fastaParser.getSizeOfRegion(region)] for region in self.chromosomes)
        self.maxSize = max(self.sizes.values())

    def setRegion(self, chromosome, start, end):
        """Restrict the analysis to one region (nothing is done if chromosome is None)."""
        if chromosome is None:
            return
        # use the parameter: the former code read the module-level "options",
        # which only exists when the file is run as a script
        self.maxSize = end
        self.sizes = {chromosome: end}
        self.chromosomes = [chromosome]
        self.chromosome = chromosome
        self.start = start
        self.end = end

    def setOutputFile(self, fileName):
        """Set the name of the output image."""
        self.outputFileName = fileName

    def setNbBins(self, nbBins):
        """Set the number of bins (0: one point per start position)."""
        self.nbBins = nbBins

    def set2Strands(self, twoStrands):
        """When True, one curve is drawn per strand."""
        self.twoStrands = twoStrands

    def setNames(self, names):
        """Set the list of tag names whose values are accumulated."""
        self.names = names

    def setAverage(self, average):
        """When True, bins hold the average per nucleotide instead of the sum."""
        self.average = average

    def setNormalization(self, normalization):
        """When True, the tags are rescaled so that their totals are equal."""
        self.normalization = normalization

    def setImageSize(self, height, width):
        """Set the size of each plot."""
        self.height = height
        self.width = width

    def setYLimits(self, yMin, yMax):
        """Set the limits of the y-axis (None: no limit)."""
        self.yMin = yMin
        self.yMax = yMax

    def setColors(self, colors):
        """Set the list of colors of the lines (one per tag name), or None."""
        self.colors = colors

    def writeGff(self, fileName):
        """Request a GFF3 export in the given file (None: no export)."""
        self.gffFileName = fileName

    def writeCsv(self, fileName):
        """Request a CSV export in the given file (None: no export)."""
        self.csvFileName = fileName

    def mergePlots(self, merge):
        """When True, all the chromosomes are drawn in a single figure."""
        self.merge = merge

    def _estimateSizes(self):
        """Estimate the chromosome sizes from the largest start seen, and count the elements."""
        progress = UnlimitedProgress(10000, "Reading input for chromosome size estimate", self.verbosity)
        self.sizes = {}
        nbTranscripts = 0
        for transcript in self.parser.getIterator():
            chromosome = transcript.getChromosome()
            start = transcript.getStart()
            self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0))
            nbTranscripts += 1
            progress.inc()
        progress.done()
        # real number of elements (the former enumerate() index was one short);
        # left unknown when the input is empty, as before
        if nbTranscripts > 0:
            self.nbTranscripts = nbTranscripts

    def _computeSliceSize(self):
        """Choose the bin size: largest size / number of bins, rounded down to 2 significant digits."""
        if self.nbBins == 0:
            return
        rawSize = int(max(self.sizes.values()) / float(self.nbBins))
        magnitude = 10 ** max(0, len("%d" % (rawSize)) - 2)
        self.sliceSize = max(1, (rawSize // magnitude) * magnitude)
        if self.verbosity > 0:
            print("choosing bin size of %d" % (self.sliceSize))

    def _initBins(self):
        """Create the bins: bins[chromosome][name][strand] = {bin start: value}."""
        self.bins = {}
        for chromosome in self.sizes:
            self.bins[chromosome] = {}
            for name in self.names:
                self.bins[chromosome][name] = {}
                for strand in TWOSTRANDS[self.twoStrands]:
                    if self.nbBins == 0:
                        self.bins[chromosome][name][strand] = {}
                    else:
                        self.bins[chromosome][name][strand] = dict([(i * self.sliceSize + 1, 0.0) for i in range(self.start // self.sliceSize, self.sizes[chromosome] // self.sliceSize + 1)])

    def _populateBins(self):
        """Read the input and add the value of every tag of every element to its bin."""
        if self.nbTranscripts is None:
            progress = UnlimitedProgress(10000, "Counting data", self.verbosity)
        else:
            progress = Progress(self.nbTranscripts, "Counting data", self.verbosity)
        for transcript in self.parser.getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            progress.inc()
            chromosome = transcript.getChromosome()
            start = transcript.getStart()
            if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end):
                continue
            strand = transcript.getDirection() if self.twoStrands else 0
            if self.nbBins != 0:
                # NOTE(review): a start that is an exact multiple of the slice
                # size falls in the following bin; kept as is, to be confirmed
                binStart = (start // self.sliceSize) * self.sliceSize + 1
            else:
                binStart = start
            for name in self.names:
                # an element without the tag counts for one
                value = float(transcript.tags.get(name, 1))
                strandBins = self.bins[chromosome][name][strand]
                strandBins[binStart] = strandBins.get(binStart, 0) + value
                self.nbValues[name] = self.nbValues.get(name, 0) + value
        progress.done()

    def _normalize(self):
        """Rescale every tag so that its total equals the average total of all the tags."""
        if not self.nbValues:
            return
        # sum the totals (the former code summed the dict itself, i.e. the tag names)
        average = float(sum(self.nbValues.values())) / len(self.nbValues)
        factors = dict([name, average / self.nbValues[name]] for name in self.nbValues)
        for chromosome in self.bins:
            for name in self.bins[chromosome]:
                for strand in self.bins[chromosome][name]:
                    strandBins = self.bins[chromosome][name][strand]
                    for binStart in strandBins:
                        strandBins[binStart] *= factors[name]

    def _computeAverage(self):
        """Divide every bin by the bin size."""
        if self.nbBins == 0:
            # one point per position: there is no slice to average on
            return
        for chromosome in self.bins:
            for name in self.bins[chromosome]:
                for strand in self.bins[chromosome][name]:
                    strandBins = self.bins[chromosome][name][strand]
                    for binStart in strandBins:
                        strandBins[binStart] = float(strandBins[binStart]) / self.sliceSize

    def _getPlotter(self, chromosome):
        """Build the plot of one chromosome (one line per tag name and per strand)."""
        plot = RPlotter("%s_%s.png" % (os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity)
        plot.setImageSize(self.width, self.height)
        if self.sizes[chromosome] <= 1000:
            unit = "nt."
            ratio = 1.0
        elif self.sizes[chromosome] <= 1000000:
            unit = "kb"
            ratio = 1000.0
        else:
            unit = "Mb"
            ratio = 1000000.0
        if self.yMin is not None:
            plot.setMinimumY(self.yMin)
        if self.yMax is not None:
            plot.setMaximumY(self.yMax)
        plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
        plot.setLegend(True)
        for i, name in enumerate(self.bins[chromosome]):
            for strand in self.bins[chromosome][name]:
                fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand])
                # the minus strand is drawn below the x-axis
                factor = 1 if strand == 0 else strand
                correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].items()])
                plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None)
        return plot

    def _plot(self):
        """Draw one plot per chromosome, or a single merged figure."""
        if self.merge:
            multiplePlot = MultipleRPlotter(self.outputFileName, self.verbosity)
            multiplePlot.setImageSize(self.width, self.height * len(self.bins.keys()))
        # use the instance verbosity: the former code read the module-level "options"
        progress = Progress(len(self.bins.keys()), "Plotting", self.verbosity)
        for chromosome in sorted(self.bins.keys()):
            plot = self._getPlotter(chromosome)
            if self.merge:
                multiplePlot.addPlot(plot)
            else:
                plot.plot()
            progress.inc()
        if self.merge:
            multiplePlot.plot()
        progress.done()

    def _writeCsv(self):
        """Write the bins in a semicolon-separated file.

        The header holds the bin intervals (or the positions when no binning is
        used); then one line is written per chromosome, tag name and strand.
        """
        if self.verbosity > 1:
            print("Writing CSV file...")
        if self.nbBins != 0:
            # bins are stored under their first position, not under their index
            xValues = [index * self.sliceSize + 1 for index in range(self.start // self.sliceSize, max(self.sizes.values()) // self.sliceSize + 1)]
            labels = ["%d-%d" % (binStart, binStart + self.sliceSize - 1) for binStart in xValues]
        else:
            positions = set()
            for chromosome in self.bins:
                for name in self.bins[chromosome]:
                    for strand in self.bins[chromosome][name]:
                        positions.update(self.bins[chromosome][name][strand].keys())
            xValues = sorted(positions)
            labels = ["%d" % (position) for position in xValues]
        with open(self.csvFileName, "w") as csvHandle:
            csvHandle.write(";".join(["chromosome", "tag", "strand"] + labels) + "\n")
            for chromosome in self.bins:
                for name in self.bins[chromosome]:
                    for strand in self.bins[chromosome][name]:
                        strandBins = self.bins[chromosome][name][strand]
                        fields = ["%s" % (chromosome), "%s" % (name), "%s" % (STRANDTOSTR[strand])]
                        fields.extend(["%.2f" % (strandBins.get(binStart, 0)) for binStart in xValues])
                        csvHandle.write(";".join(fields) + "\n")
        if self.verbosity > 1:
            print("...done")

    def _writeGff(self):
        """Write one GFF3 feature per bin."""
        if self.verbosity > 1:
            print("Writing GFF file...")
        writer = Gff3Writer(self.gffFileName, self.verbosity)
        cpt = 1
        for chromosome in self.bins:
            for name in self.bins[chromosome]:
                for strand in self.bins[chromosome][name]:
                    for binStart in self.bins[chromosome][name][strand]:
                        transcript = Transcript()
                        transcript.setChromosome(chromosome)
                        transcript.setStart(binStart)
                        if self.nbBins > 0:
                            # a bin covers sliceSize positions, first one included
                            transcript.setEnd(binStart + self.sliceSize - 1)
                        else:
                            transcript.setEnd(binStart)
                        transcript.setDirection(1 if strand == 0 else strand)
                        transcript.setTagValue("ID", "region%d" % (cpt))
                        # the features were built but never given to the writer
                        writer.addTranscript(transcript)
                        cpt += 1
        writer.write()
        writer.close()
        if self.verbosity > 1:
            print("...done")

    def run(self):
        """Compute the bins, draw the plots and write the requested exports."""
        if self.sizes is None:
            self._estimateSizes()
        self._computeSliceSize()
        self._initBins()
        self._populateBins()
        if self.normalization:
            self._normalize()
        if self.average:
            self._computeAverage()
        self._plot()
        if self.csvFileName is not None:
            self._writeCsv()
        if self.gffFileName is not None:
            self._writeGff()


if __name__ == "__main__":

    description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]")
    parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]")
    parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]")
    parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]")
    parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]")
    parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]")
    parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]")
    parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]")
    parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]")
    parser.add_option("-H", "--height", dest="height", action="store", default=300, type="int", help="height of the graphics [format: int] [default: 300]")
    parser.add_option("-W", "--width", dest="width", action="store", default=600, type="int", help="width of the graphics [format: int] [default: 600]")
    parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]")
    parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: nbElements] [format: string]")
    parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]")
    parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]")
    parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    (options, args) = parser.parse_args()

    gt = GetDistribution(options.verbosity)
    gt.setInputFile(options.inputFileName, options.format)
    gt.setOutputFile(options.outputFileName)
    gt.setReferenceFile(options.referenceFileName)
    gt.setNbBins(int(options.nbBins))
    gt.set2Strands(options.bothStrands)
    gt.setRegion(options.chromosome, options.start, options.end)
    gt.setNormalization(options.normalize)
    gt.setAverage(options.average)
    gt.setYLimits(options.yMin, options.yMax)
    gt.writeCsv(options.csv)
    gt.writeGff(options.gff)
    gt.setImageSize(options.height, options.width)
    gt.setNames(options.names.split(","))
    gt.setColors(None if options.colors is None else options.colors.split(","))
    gt.mergePlots(options.mergePlots)
    gt.run()


# Changeset metadata and header of the next file of the patch, kept verbatim
# (commented out):
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/GetFlanking.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/GetFlanking.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,233 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2011
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
+#
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.Interval import Interval
+from SMART.Java.Python.misc.Progress import Progress
+
+# Identifiers of the two input data sets; they are the keys of GetFlanking.transcripts.
+QUERY = 0
+REFERENCE = 1
+INPUTS = (QUERY, REFERENCE)
+# Scan directions tried by GetFlanking.run() on every chromosome.
+STRANDS = (-1, 1)
+# Fragments combined by GetFlanking.setTags() to build the names of the tags it sets.
+TAG_DISTANCE = "distance_"
+TAG_SENSE = "_sense"
+TAG_REGION = "_region"
+# Lookup tables indexed by -1 / 0 / 1 (a direction, a product of directions, or a sign).
+TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"}
+TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}
+TAGS_SENSE = {-1: "antisense", 0: "", 1: "collinear"}
+STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}
+
+
+def getOrderKey(transcript, direction, input):
+    """Return the sort key used by GetFlanking.getFlanking().
+
+    direction ==  1: queries sort on (end, -start), references on (start, -end).
+    otherwise      : queries sort on (-start, end), references on (-end, start).
+
+    `input` is QUERY or REFERENCE and selects which of the two keys is built.
+    """
+    if direction == 1:
+        if input == QUERY:
+            return (transcript.getEnd(), -transcript.getStart())
+        return (transcript.getStart(), -transcript.getEnd())
+    if input == QUERY:
+        return (-transcript.getStart(), transcript.getEnd())
+    return (-transcript.getEnd(), transcript.getStart())
+
+
+class GetFlanking(object):
+    """For every query element, find a flanking reference element and tag the query with it.
+
+    Typical use (see the script section below): create the object, load the
+    query and reference files, choose the output file and the filters, then
+    call run().
+    """
+
+    def __init__(self, verbosity):
+        """Store the verbosity and set every filter to its "disabled" value."""
+        self.verbosity = verbosity
+        # transcripts[QUERY or REFERENCE][chromosome] -> list of transcripts
+        self.transcripts = dict([id, {}] for id in INPUTS)
+        # Directions requested by the user (-1: upstream, 1: downstream).
+        self.directions = []
+        self.noOverlap = False
+        self.colinear = False
+        self.antisense = False
+        # NOTE(review): self.distance is not read anywhere in this file -- confirm it can go.
+        self.distance = None
+        self.minDistance = None
+        self.maxDistance = None
+        self.tagName = "flanking"
+
+    def setInputFile(self, fileName, format, id):
+        """Parse `fileName` (in format `format`) and store its transcripts per chromosome.
+
+        `id` is QUERY or REFERENCE and selects the data set that is filled.
+        """
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        parser = chooser.getParser(fileName)
+        for transcript in parser.getIterator():
+            chromosome = transcript.getChromosome()
+            if chromosome not in self.transcripts[id]:
+                self.transcripts[id][chromosome] = []
+            self.transcripts[id][chromosome].append(transcript)
+
+    def setOutputFile(self, fileName):
+        """Create the GFF3 writer that receives the tagged queries."""
+        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+
+    def addUpstreamDirection(self, upstream):
+        """Request the upstream direction (-1) when `upstream` is true."""
+        if upstream:
+            self.directions.append(-1)
+
+    def addDownstreamDirection(self, downstream):
+        """Request the downstream direction (1) when `downstream` is true."""
+        if downstream:
+            self.directions.append(1)
+
+    def setColinear(self, colinear):
+        """When true, match() rejects references whose direction differs from the query's."""
+        self.colinear = colinear
+
+    def setAntisense(self, antisense):
+        """When true, match() rejects references whose direction equals the query's."""
+        self.antisense = antisense
+
+    def setNoOverlap(self, noOverlap):
+        """When true, match() rejects references that overlap the query."""
+        self.noOverlap = noOverlap
+
+    def setMinDistance(self, distance):
+        """Set the minimum accepted query/reference distance (None disables the filter)."""
+        self.minDistance = distance
+
+    def setMaxDistance(self, distance):
+        """Set the maximum accepted query/reference distance (None disables the filter)."""
+        self.maxDistance = distance
+
+    def setNewTagName(self, tagName):
+        """Set the base name of the tags written by setTags()."""
+        self.tagName = tagName
+
+    def match(self, transcriptQuery, transcriptRef, direction):
+        """Tell whether `transcriptRef` is an acceptable flanking element of `transcriptQuery`.
+
+        Returns False when the reference ends before the query starts
+        (direction 1), when the query ends before the reference starts
+        (direction -1), or when one of the enabled filters (no overlap,
+        colinear, antisense, min/max distance) rejects the pair; True otherwise.
+        """
+        #print "comparing", transcriptQuery, "with", transcriptRef, "on direction", direction
+        if direction == 1 and transcriptRef.getEnd() < transcriptQuery.getStart():
+            return False
+        if direction == -1 and transcriptQuery.getEnd() < transcriptRef.getStart():
+            return False
+        if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
+            return False
+        if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection():
+            return False
+        if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection():
+            return False
+        # The distance is only computed when at least one distance filter is set.
+        if self.minDistance != None or self.maxDistance != None:
+            distance = transcriptRef.getDistance(transcriptQuery)
+            if self.minDistance != None and distance < self.minDistance:
+                return False
+            if self.maxDistance != None and distance > self.maxDistance:
+                return False
+        return True
+
+    def getFlanking(self, chromosome, direction):
+        """Fill self.flankings[query][direction] for the queries of `chromosome`.
+
+        Queries and references are sorted with getOrderKey() and swept together:
+        for each query, the reference index advances until match() accepts the
+        pair.  Nothing is done when the chromosome has no reference; the sweep
+        stops as soon as the references are exhausted.
+        """
+        if chromosome not in self.transcripts[REFERENCE]:
+            return
+        sortedTranscripts = dict([id, {}] for id in INPUTS)
+        for id in INPUTS:
+            sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction, id))
+        # NOTE(review): refIndex never moves backwards, so a reference skipped for one
+        # query is not reconsidered for the following ones -- confirm this is intended
+        # when the colinear / antisense / distance filters are enabled.
+        refIndex = 0
+        progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
+        for query in sortedTranscripts[QUERY]:
+            #print "Q: ", query
+            #print "R1: ", sortedTranscripts[REFERENCE][refIndex]
+            while not self.match(query, sortedTranscripts[REFERENCE][refIndex], direction):
+                refIndex += 1
+                if refIndex == len(sortedTranscripts[REFERENCE]):
+                    progress.done()
+                    #print "done"
+                    return
+            #print "R2: ", sortedTranscripts[REFERENCE][refIndex]
+            self.flankings[query][direction] = sortedTranscripts[REFERENCE][refIndex]
+            progress.inc()
+        progress.done()
+
+    def setTags(self, query, reference, direction):
+        """Copy the description of `reference` into tags of `query` and return `query`.
+
+        The reference is named by its "ID" tag, else its name, else its string
+        form.  The tags set are: the reference name, the query/reference
+        distance, the relative sense, the relative region (only when
+        `direction` is 0) and a copy of every reference tag except "quality"
+        and "feature".
+        """
+        refName = reference.getTagValue("ID")
+        if refName == None:
+            refName = reference.getName()
+            if refName == None:
+                refName = reference.__str__()
+        query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()]), refName)
+        query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction*query.getDirection()]), query.getDistance(reference))
+        query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
+        if direction == 0:
+            # cmp(x, 0) is the sign of x (-1, 0 or 1); Python 2 built-in.
+            query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)])
+        for tag in reference.getTagNames():
+            if tag not in ("quality", "feature"):
+                query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()], tag), reference.getTagValue(tag))
+        return query
+
+    def write(self):
+        """Send every query of self.flankings to the writer.
+
+        Queries without any flanking element are written unchanged.  When
+        directions were requested, the query is tagged for each requested
+        direction that has a flanking element; otherwise it is tagged with the
+        closest of its flanking elements (direction 0).
+        """
+        progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
+        for transcriptQuery in self.flankings.keys():
+            if not self.flankings[transcriptQuery]:
+                self.writer.addTranscript(transcriptQuery)
+            elif self.directions:
+                for direction in self.directions:
+                    #relativeDirection = direction if transcriptQuery.getDirection() == 1 else - direction
+                    relativeDirection = direction * transcriptQuery.getDirection()
+                    if relativeDirection in self.flankings[transcriptQuery]:
+                        transcriptRef = self.flankings[transcriptQuery][relativeDirection]
+                        transcriptQuery = self.setTags(transcriptQuery, transcriptRef, relativeDirection)
+                self.writer.addTranscript(transcriptQuery)
+            else:
+                transcriptRef = sorted(self.flankings[transcriptQuery].values(), key = lambda transcriptRef: transcriptQuery.getDistance(transcriptRef))[0]
+                self.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0))
+            progress.inc()
+        progress.done()
+
+    def run(self):
+        """Process the query chromosomes in sorted order, then close the writer.
+
+        self.flankings is rebuilt for each chromosome, filled for both scan
+        directions and written before the next chromosome is handled.
+        """
+        for chromosome in sorted(self.transcripts[QUERY].keys()):
+            self.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome])
+            for direction in STRANDS:
+                #print "comparison", chromosome, direction
+                self.getFlanking(chromosome, direction)
+            self.write()
+        self.writer.close()
+
+if __name__ == "__main__":
+
+    description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]")
+    parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]")
+    parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]")
+    parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]")
+    parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
+    parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]")
+    parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]")
+    parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    # Input 1 is the query set, input 2 the reference set.
+    gf = GetFlanking(options.verbosity)
+    gf.setInputFile(options.inputFileName1, options.format1, QUERY)
+    gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
+    gf.setOutputFile(options.outputFileName)
+    gf.addUpstreamDirection(options.upstream)
+    gf.addDownstreamDirection(options.downstream)
+    gf.setColinear(options.colinear)
+    gf.setAntisense(options.antisense)
+    gf.setNoOverlap(options.noOverlap)
+    gf.setMinDistance(options.minDistance)
+    gf.setMaxDistance(options.maxDistance)
+    gf.setNewTagName(options.tagName)
+    gf.run()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/GetIntersection.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/GetIntersection.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,164 @@
+#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.structure.Interval import Interval
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.Mapping import Mapping
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+
+# Bin levels: at level i, a bin covers 10 ** i consecutive positions.
+MINBIN = 3
+MAXBIN = 7
+REFERENCE = 0
+QUERY = 1
+
+def getBin(start, end):
+    """Return the identifier of the smallest bin that contains [start, end].
+
+    Levels MINBIN..MAXBIN are tried in increasing order.  At level i the
+    identifier is i * 10 ** (MAXBIN + 1) + start // 10 ** i.  An interval that
+    crosses a bin boundary at every level gets the catch-all identifier
+    (MAXBIN + 1) * 10 ** (MAXBIN + 1).
+    """
+    for i in range(MINBIN, MAXBIN + 1):
+        binLevel = 10 ** i
+        # Floor division: same result as the former int(a / b) on Python 2
+        # integers, and it stays in integer arithmetic on Python 3.
+        if int(start // binLevel) == int(end // binLevel):
+            return int(i * 10 ** (MAXBIN + 1) + int(start // binLevel))
+    return int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
+
+def getOverlappingBins(start, end):
+    """Return the bin identifier ranges that may hold elements overlapping [start, end].
+
+    The result is a list of (firstBin, lastBin) pairs, one per level from
+    MINBIN to MAXBIN, followed by the catch-all bin paired with itself.
+    """
+    array = []
+    bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
+    for i in range(MINBIN, MAXBIN + 1):
+        binLevel = 10 ** i
+        levelOffset = i * 10 ** (MAXBIN + 1)
+        array.append((int(levelOffset + int(start // binLevel)), int(levelOffset + int(end // binLevel))))
+    array.append((bigBin, bigBin))
+    return array
+
+
+class GetIntersection(object):
+    """Intersect every query element with the reference elements it overlaps.
+
+    The references are loaded in memory and indexed per chromosome and per bin
+    (see getBin); the queries are then streamed and compared bin by bin.
+    """
+
+    def __init__(self, verbosity):
+        """Store the verbosity and reset the counters and the bin index."""
+        self.verbosity = verbosity
+        self.nbQueries = 0
+        self.nbRefs = 0
+        self.nbWritten = 0
+        # bins[chromosome][binId] -> list of reference transcripts
+        self.bins = {}
+
+    def setReferenceFile(self, fileName, format):
+        """Create the parser of the reference file `fileName` (in format `format`)."""
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.refParser = chooser.getParser(fileName)
+
+    def setQueryFile(self, fileName, format):
+        """Create the parser of the query file `fileName` (in format `format`)."""
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.queryParser = chooser.getParser(fileName)
+
+    def setOutputFile(self, fileName):
+        """Create the GFF3 writer that receives the intersections."""
+        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+
+    def loadRef(self):
+        """Read every reference and store it in self.bins; count them in self.nbRefs."""
+        progress = UnlimitedProgress(10000, "Reading references", self.verbosity)
+        for transcript in self.refParser.getIterator():
+            if transcript.__class__.__name__ == "Mapping":
+                transcript = transcript.getTranscript()
+            chromosome = transcript.getChromosome()
+            bin = getBin(transcript.getStart(), transcript.getEnd())
+            if chromosome not in self.bins:
+                self.bins[chromosome] = {}
+            if bin not in self.bins[chromosome]:
+                self.bins[chromosome][bin] = []
+            self.bins[chromosome][bin].append(transcript)
+            self.nbRefs += 1
+            progress.inc()
+        progress.done()
+
+    def _compareTranscript(self, queryTranscript):
+        """Return the merged intersection of `queryTranscript` with the references.
+
+        Every reference stored in a bin that may overlap the query is
+        intersected with it; the non-empty intersections are merged into one
+        transcript.  Returns None when the chromosome holds no reference or
+        when no intersection is found.
+        """
+        queryChromosome = queryTranscript.getChromosome()
+        if queryChromosome not in self.bins:
+            return None
+        queryStart = queryTranscript.getStart()
+        queryEnd = queryTranscript.getEnd()
+        bins = getOverlappingBins(queryStart, queryEnd)
+        overlaps = []
+        for binRange in bins:
+            for bin in range(binRange[0], binRange[1]+1):
+                if bin not in self.bins[queryChromosome]:
+                    continue
+                for refTranscript in self.bins[queryChromosome][bin]:
+                    newTranscript = queryTranscript.getIntersection(refTranscript)
+                    if newTranscript is not None:
+                        overlaps.append(newTranscript)
+        if not overlaps:
+            return None
+        newTranscript = overlaps[0]
+        for transcript in overlaps[1:]:
+            newTranscript.merge(transcript)
+        return newTranscript
+
+    def compare(self):
+        """Stream the queries, write the intersection of each one, then close the writer."""
+        progress = UnlimitedProgress(10000, "Comparing queries", self.verbosity)
+        for queryTranscript in self.queryParser.getIterator():
+            if queryTranscript.__class__.__name__ == "Mapping":
+                queryTranscript = queryTranscript.getTranscript()
+            progress.inc()
+            self.nbQueries += 1
+            newTranscript = self._compareTranscript(queryTranscript)
+            if newTranscript is not None:
+                # Write the shrunk element.  The untouched query used to be
+                # written here, which discarded the intersection just computed
+                # and contradicted the documented purpose of the tool.
+                self.writer.addTranscript(newTranscript)
+                self.nbWritten += 1
+        progress.done()
+        self.writer.close()
+
+    def displayResults(self):
+        """Print the number of queries read, references read and elements written."""
+        print("# queries: %d" % (self.nbQueries))
+        print("# refs: %d" % (self.nbRefs))
+        print("# written: %d" % (self.nbWritten))
+
+    def run(self):
+        """Load the references, compare the queries and print the counters."""
+        self.loadRef()
+        self.compare()
+        self.displayResults()
+
+if __name__ == "__main__":
+
+    description = "Get Intersection v1.0.0: Shrink the first data set so that all bases covered by the first data set is also covered by the second data set. [Category: Data Comparison]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    # Input 1 is the query set, input 2 the reference set.
+    gi = GetIntersection(options.verbosity)
+    gi.setQueryFile(options.inputFileName1, options.format1)
+    gi.setReferenceFile(options.inputFileName2, options.format2)
+    gi.setOutputFile(options.outputFileName)
+    gi.run()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/GetRandomSubset.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/GetRandomSubset.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,96 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2011
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import random
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.misc.Progress import Progress
+
+class GetRandomSubset(object):
+    """Write a random subset of the elements of an input file.
+
+    Call order matters: setInputFile() must come before setNumber(), because
+    a percentage is converted into a count using the parser.
+    """
+
+    def __init__(self, verbosity):
+        """Store the verbosity passed on to the parser, the writer and the progress bar."""
+        self.verbosity = verbosity
+
+    def setInputFile(self, fileName, format):
+        """Create the parser of `fileName` (in format `format`)."""
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.parser = chooser.getParser(fileName)
+
+    def setNumber(self, number, percent):
+        """Set the number of elements to output.
+
+        number:  absolute count; an int or a string holding an int (the
+                 command line passes a string).  Takes precedence when given.
+        percent: percentage (0-100) of the input elements, used when
+                 `number` is None.
+
+        Raises Exception when both are None, and ValueError when `number`
+        or `percent` is not numeric.
+        """
+        if number is not None:
+            # The -n option is parsed as a string; random.sample() needs an int.
+            self.number = int(number)
+        elif percent is not None:
+            self.number = int(float(percent) / 100 * self.parser.getNbTranscripts())
+        else:
+            raise Exception("Error! Number of elements to output is not given!")
+
+    def setOutputFile(self, fileName):
+        """Create the GFF3 writer that receives the selected elements."""
+        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+
+    def chooseElements(self):
+        """Draw self.number distinct element indices and store them in self.randomIndices."""
+        self.randomIndices = random.sample(range(self.parser.getNbTranscripts()), self.number)
+
+    def run(self):
+        """Choose the indices, then copy the matching input elements to the writer."""
+        self.chooseElements()
+        # Set membership is O(1); testing against the list made the loop quadratic.
+        chosenIndices = set(self.randomIndices)
+        progress = Progress(self.parser.getNbTranscripts(), "Reading input file", self.verbosity)
+        nbWritten = 0
+        for cpt1, transcript in enumerate(self.parser.getIterator()):
+            if cpt1 in chosenIndices:
+                self.writer.addTranscript(transcript)
+                nbWritten += 1
+            progress.inc()
+        self.writer.write()
+        self.writer.close()
+        progress.done()
+        if self.verbosity > 1:
+            print("%d transcripts read" % (self.parser.getNbTranscripts()))
+            print("%d transcripts written" % (nbWritten))
+
+
+if __name__ == "__main__":
+
+    description = "Get Random Subset v1.0.1: Get a random sub-set of a list of genomic coordinates. [Category: Personal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
+    parser.add_option("-n", "--number", dest="number", action="store", default=None, type="string", help="number of elements to output [format: int]")
+    parser.add_option("-p", "--percent", dest="percent", action="store", default=None, type="string", help="percentage of elements to output (between 0 and 100) [format: int]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    # The input file must be set before the number: a percentage needs the parser.
+    grs = GetRandomSubset(options.verbosity)
+    grs.setInputFile(options.inputFileName, options.format)
+    grs.setNumber(options.number, options.percent)
+    grs.setOutputFile(options.outputFileName)
+    grs.run()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/GetReadDistribution.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/GetReadDistribution.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,303 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
LOG_DEPTH      = "smart"
DEFAULT_REGION = "_all_"
# Unit suffix appended to the x-axis label, keyed by the divisor returned by
# GetReadDistribution._findMultiple(): 1,000 bp is a kbp, 1,000,000 bp is a Mbp.
MULTIPLE_STR   = {1: "", 1000: " (in kbp)", 1000000: " (in Mbp)"}


class GetReadDistribution(object):
    """Count reads per genomic bin for several samples and plot them with R/ggplot2.

    Typical use: call the setters (names first, then input files), then run().
    The counts are kept in self.distribution, indexed as
    distribution[region][sample name][chromosome][strand][bin] -> count.
    """

    def __init__(self, verbosity = 0):
        self.xLab           = ""
        self.yLab           = "# reads"
        self.verbosity      = verbosity
        # Random suffix used to build the names of the temporary .dat/.R files.
        self.number         = random.randint(0, 100000)
        self.log            = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
        self.names          = []
        self.parsers        = {}
        self.distribution   = {}
        self.factors        = {}
        self.regions        = None
        self.tmpDatName     = None
        self.tmpRName       = None
        self.outputFileName = None
        # Defaults mirror the command-line defaults so the class is usable without every setter.
        self.binSize        = 10000
        self.colors         = None
        self.multiple       = False
        self.quorum         = 1
        self.strands        = False
        self.width          = 800
        self.height         = 300
        self.arial          = False

    def setNames(self, names):
        """Set the sample names; must be called before setInputFiles() and setFactors()."""
        self.names = names

    def setInputFiles(self, fileNames, format):
        """Create one parser per input file; the i-th file is bound to the i-th sample name."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        for cpt, fileName in enumerate(fileNames):
            self.parsers[self.names[cpt]] = chooser.getParser(fileName)

    def setOutputFileName(self, fileName):
        """Set the name of the PNG file to produce."""
        self.outputFileName = fileName

    def setLabs(self, xLab, yLab):
        """Set the x- and y-axis labels."""
        self.xLab = xLab
        self.yLab = yLab

    def setBinSize(self, binSize):
        """Set the bin width, in nucleotides."""
        self.binSize = binSize

    def setColors(self, colors):
        """Set the list of bar colors (None selects the ggplot2 'Set1' palette)."""
        self.colors = colors

    def setFactors(self, factors):
        """Set one normalization factor per sample name (None means 1.0 for every sample)."""
        if factors is None:
            self.factors = dict((name, 1.0) for name in self.names)
        else:
            self.factors = dict(zip(self.names, factors))

    def setMultiple(self, boolean):
        """Enable or disable the scaling of genomic positions to kbp/Mbp."""
        self.multiple = boolean

    def setImageSize(self, width, height):
        """Set the image size in pixels; a None value keeps the current one."""
        if width is not None:
            self.width = width
        if height is not None:
            self.height = height

    def setQuorum(self, quorum):
        """Set the minimum bin count a region must reach to be plotted (None disables the check)."""
        self.quorum = quorum

    def setRegionsFile(self, fileName):
        """Load the regions to plot from a GFF file; None keeps the single default region."""
        if fileName is not None:
            self._loadRegions(fileName)

    def setBothStrands(self, strands):
        """When true, counts are kept per strand instead of being pooled."""
        self.strands = strands

    def setArial(self, arial):
        """When true, the R script loads 'extrafont' and uses the Arial family."""
        self.arial = arial

    def _checkOptions(self):
        """Raise an Exception if no input file was registered."""
        if not self.parsers:
            # The original called the non-existent 'logAndRaise', hiding the real message
            # behind an AttributeError.
            self._logAndRaise("ERROR: Missing input file names")

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at error level, then raise it as an Exception."""
        self.log.error(errorMsg)
        raise Exception(errorMsg)

    def _loadRegions(self, fileName):
        """Fill self.regions as regions[chromosome][start][end] -> list of region names."""
        self.regions = {}
        parser = GffParser(fileName, self.verbosity)
        for transcript in parser.getIterator():
            perStart = self.regions.setdefault(transcript.getChromosome(), {})
            perEnd   = perStart.setdefault(transcript.getStart(), {})
            perEnd.setdefault(transcript.getEnd(), []).append(transcript.getName())

    def _getRegions(self, transcript):
        """Return the names of the loaded regions that overlap transcript.

        Without a regions file, every transcript belongs to DEFAULT_REGION.
        """
        if self.regions is None:
            return [DEFAULT_REGION]
        chromosome = transcript.getChromosome()
        start      = transcript.getStart()
        end        = transcript.getEnd()
        if chromosome not in self.regions:
            return []
        names = []
        for loadedStart in sorted(self.regions[chromosome].keys()):
            # Starts are visited in increasing order: nothing further can overlap.
            if loadedStart > end:
                return names
            for loadedEnd in sorted(self.regions[chromosome][loadedStart].keys(), reverse = True):
                # Ends are visited in decreasing order: the remaining ones end before the transcript.
                if loadedEnd < start:
                    break
                names.extend(self.regions[chromosome][loadedStart][loadedEnd])
        return names

    def _parse(self, name):
        """Read the input of sample name and add its reads to self.distribution."""
        progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
        factor   = self.factors.get(name, 1)
        for transcript in self.parsers[name].getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            regions = self._getRegions(transcript)
            if regions:
                chromosome = transcript.getChromosome()
                nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
                nbElements *= factor
                strand     = transcript.getDirection() if self.strands else 1
                exons      = transcript.getExons()
            for region in regions:
                counts      = self.distribution.setdefault(region, {}).setdefault(name, {}).setdefault(chromosome, {}).setdefault(strand, {})
                previousBin = None
                for exon in exons:
                    exonStart = exon.getStart()
                    exonEnd   = exon.getEnd()
                    if exonEnd < exonStart:
                        continue
                    # Visit each covered bin once instead of every nucleotide; '//' keeps the
                    # integer bins of the original Python 2 division.
                    for currentBin in range(exonStart // self.binSize, exonEnd // self.binSize + 1):
                        # A bin shared with the end of the previous exon is counted only once.
                        if currentBin != previousBin:
                            counts[currentBin] = counts.get(currentBin, 0) + nbElements
                            previousBin = currentBin
            progress.inc()
        progress.done()

    def _iterBinCounts(self, region):
        """Yield every {bin: count} dictionary of region (one per sample, chromosome and strand)."""
        for chromosomes in self.distribution[region].values():
            for strands in chromosomes.values():
                for bins in strands.values():
                    yield bins

    def _checkQuorum(self, region):
        """Return True if the highest bin count of region reaches self.quorum (or if no quorum is set)."""
        if self.quorum is None:
            return True
        counts = [count for bins in self._iterBinCounts(region) for count in bins.values()]
        # The original returned the raw maximum, so the quorum value was never applied.
        return bool(counts) and max(counts) >= self.quorum

    def _writeData(self, region):
        """Write the counts of region to the temporary tab-separated file read by the R script."""
        self.tmpDatName = "tmpFile%d.dat" % (self.number)
        with open(self.tmpDatName, "w") as handle:
            handle.write("Chr\tPos\tStrand\tCount\tSample\n")
            for name in self.distribution[region]:
                for chromosome in sorted(self.distribution[region][name].keys()):
                    for strand in sorted(self.distribution[region][name][chromosome].keys()):
                        bins = self.distribution[region][name][chromosome][strand]
                        for pos in sorted(bins.keys()):
                            # Multiplying by the strand draws the counts of strand -1 below the axis.
                            handle.write("%s\t%d\t%d\t%d\t\"%s\"\n" % (chromosome, pos * self.binSize, strand, bins.get(pos, 0) * strand, name))

    def _findMultiple(self, region):
        """Return the divisor (1, 1000 or 1000000) to apply to the positions of region."""
        if not self.multiple:
            return 1
        positions = [pos for bins in self._iterBinCounts(region) for pos in bins]
        if not positions:
            return 1
        maxPosition = max(positions) * self.binSize
        if maxPosition > 2000000:
            return 1000000
        elif maxPosition > 2000:
            return 1000
        return 1

    def _writeScript(self, region):
        """Write the temporary R script that plots region from the data file."""
        self.tmpRName = "tmpFile%d.R" % (self.number)
        if region == DEFAULT_REGION:
            fileName = self.outputFileName
            title    = ""
            facet    = "Sample ~ Chr"
        else:
            # One image per region, named after the output file.
            fileName = "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
            title    = " + labs(title = \"Distribution of %s\") " % (region)
            facet    = "Sample ~ ."
        if self.colors is None:
            colors = "scale_fill_brewer(palette=\"Set1\") + scale_color_brewer(palette=\"Set1\")"
        else:
            colorList = ", ".join(["\"%s\"" % (color) for color in self.colors])
            colors    = "scale_fill_manual(values = c(%s)) + scale_color_manual(values = c(%s))" % (colorList, colorList)
        multiple = self._findMultiple(region)
        arial    = ", text = element_text(family=\"Arial\", size=20)" if self.arial else ""
        # The context manager guarantees the script is flushed and closed before R reads it.
        with open(self.tmpRName, "w") as handle:
            if self.arial:
                handle.write("library(extrafont)\nloadfonts()\n")
            handle.write("library(ggplot2)\n")
            handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
            handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
            handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
            handle.write("ggplot(data, aes(x = Pos/%d, y = Count, fill = Sample, color = Sample)) %s + geom_bar(stat = \"identity\") + facet_grid(%s, space=\"free\") + xlab(\"%s%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (multiple, title, facet, self.xLab, MULTIPLE_STR[multiple], self.yLab, colors, arial))
            handle.write("dev.off()\n")

    def _runR(self):
        """Run the temporary script with 'R CMD BATCH'; raise an Exception on a non-zero status.

        The R executable is taken from the SMARTRPATH environment variable when it is set.
        """
        rCommand = os.environ.get("SMARTRPATH", "R")
        # An argument list needs no shell, hence no quoting of the executable path.
        status = subprocess.call([rCommand, "CMD", "BATCH", self.tmpRName])
        if status != 0:
            raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))

    def _plot(self):
        """Plot every region that passes the quorum check."""
        progress = Progress(len(self.distribution), "Plotting data", self.verbosity)
        for region in self.distribution:
            if not self._checkQuorum(region):
                self.log.info("Not displaying '%s' for it contains insufficient data." % (region))
            else:
                self._writeData(region)
                self._writeScript(region)
                self._runR()
            progress.inc()
        progress.done()

    def _cleanFiles(self):
        """Remove the temporary data and script files, and the files sharing their name as prefix."""
        for fileName in (self.tmpDatName, self.tmpRName):
            if fileName is None or not os.path.exists(fileName):
                continue
            os.remove(fileName)
            # R CMD BATCH leaves companion files next to the script (same prefix).
            for otherFileName in glob.glob("%s*" % (fileName)):
                os.remove(otherFileName)

    def run(self):
        """Check the options, read every sample, plot the regions and remove the temporary files."""
        LoggerFactory.setLevel(self.log, self.verbosity)
        self._checkOptions()
        self.log.info("START Get Read Distribution")
        for name in self.names:
            self._parse(name)
        self._plot()
        self._cleanFiles()
        self.log.info("END Get Read Distribution")


if __name__ == "__main__":
    description = "Usage: GetReadDistribution.py [options]\n\nGet Read Distribution v1.0.1: Get the distribution of a set of reads. [Category: Personal]\n"
    epilog = ""
    parser = RepetOptionParser(description = description, epilog = epilog)
    parser.add_option("-i", "--input",     dest="inputFileNames",  action="store",      default=None,      type="string", help="input files, separated by commas [compulsory] [format: string]")
    parser.add_option("-f", "--format",    dest="format",          action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
    parser.add_option("-n", "--names",     dest="names",           action="store",      default=None,      type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
    parser.add_option("-o", "--output",    dest="outputFileName",  action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
    parser.add_option("-s", "--binSize",   dest="binSize",         action="store",      default=10000,     type="int",    help="bin size [format: int] [default: 10000]")
    parser.add_option("-l", "--xLabel",    dest="xLab",            action="store",      default="",        type="string", help="x-axis label name [format: string]")
    parser.add_option("-L", "--yLabel",    dest="yLab",            action="store",      default="# reads", type="string", help="y-axis label name [format: string] [default: # reads]")
    parser.add_option("-c", "--colors",    dest="colors",          action="store",      default=None,      type="string", help="colors of the bars, separated by commas [format: string]")
    parser.add_option("-a", "--factors",   dest="factors",         action="store",      default=None,      type="string", help="normalization factors, separated by commas [format: string]")
    parser.add_option("-r", "--regions",   dest="regionsFileName", action="store",      default=None,      type="string", help="regions to plot [format: transcript file in GFF format]")
    parser.add_option("-2", "--strands",   dest="strands",         action="store_true", default=False,                    help="plot negative strands on the negative x-axis [format: boolean] [default: False]")
    parser.add_option("-m", "--multiple",  dest="multiple",        action="store_true", default=False,                    help="use human readable genomic positions (k, M) [format: boolean] [default: False]")
    parser.add_option("-q", "--quorum",    dest="quorum",          action="store",      default=1,         type="int",    help="minimum number of intervals to plot a region [format: int] [default: 1]")
    parser.add_option("-z", "--width",     dest="width",           action="store",      default=800,       type="int",    help="width of the image [format: int] [default: 800]")
    parser.add_option("-Z", "--height",    dest="height",          action="store",      default=300,       type="int",    help="height of the image [format: int] [default: 300]")
    parser.add_option("-A", "--arial",     dest="arial",           action="store_true", default=False,                    help="use Arial font [format: boolean] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity",       action="store",      default=1,         type="int",    help="trace level [format: int]")
    options = parser.parse_args()[0]
    iGetReadDistribution = GetReadDistribution(options.verbosity)
    iGetReadDistribution.setNames(options.names.split(","))
    iGetReadDistribution.setInputFiles(options.inputFileNames.split(","), options.format)
    iGetReadDistribution.setOutputFileName(options.outputFileName)
    iGetReadDistribution.setLabs(options.xLab, options.yLab)
    iGetReadDistribution.setBinSize(options.binSize)
    iGetReadDistribution.setColors(None if options.colors is None else options.colors.split(","))
    # A real list (not a lazy map object) so the factors can be inspected or reused.
    iGetReadDistribution.setFactors(None if options.factors is None else [float(factor) for factor in options.factors.split(",")])
    iGetReadDistribution.setRegionsFile(options.regionsFileName)
    iGetReadDistribution.setMultiple(options.multiple)
    iGetReadDistribution.setQuorum(options.quorum)
    iGetReadDistribution.setImageSize(options.width, options.height)
    iGetReadDistribution.setBothStrands(options.strands)
    iGetReadDistribution.setArial(options.arial)
    iGetReadDistribution.run()
LOG_DEPTH      = "smart"
DEFAULT_REGION = "_all_"


class GetReadSizes(object):
    """Count reads per size for several samples and plot the histograms with R/ggplot2.

    Typical use: call the setters (names first, then input files), then run().
    The counts are kept in self.sizes, indexed as sizes[region][sample name][size] -> count.
    """

    def __init__(self, verbosity = 0):
        self.xLab           = "Size"
        self.yLab           = "# reads"
        self.verbosity      = verbosity
        # Random suffix used to build the names of the temporary .dat/.R files.
        self.number         = random.randint(0, 100000)
        self.log            = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
        self.names          = []
        self.parsers        = {}
        self.sizes          = {}
        self.factors        = {}
        self.regions        = None
        self.tmpDatName     = None
        self.tmpRName       = None
        self.outputFileName = None
        # None means "no bound given"; the observed extremes are used for the plot.
        self.minSize        = None
        self.maxSize        = None
        self.colors         = None
        self.width          = 800
        self.height         = 300
        self.arial          = False

    def setNames(self, names):
        """Set the sample names; must be called before setInputFiles() and setFactors()."""
        self.names = names

    def setInputFiles(self, fileNames, format):
        """Create one parser per input file; the i-th file is bound to the i-th sample name."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        for cpt, fileName in enumerate(fileNames):
            self.parsers[self.names[cpt]] = chooser.getParser(fileName)

    def setOutputFileName(self, fileName):
        """Set the name of the PNG file to produce."""
        self.outputFileName = fileName

    def setLabs(self, xLab, yLab):
        """Set the x- and y-axis labels."""
        self.xLab = xLab
        self.yLab = yLab

    def setSizes(self, minSize, maxSize):
        """Set the inclusive size bounds; None leaves the corresponding side unbounded."""
        self.minSize = minSize
        self.maxSize = maxSize

    def setColors(self, colors):
        """Set the list of bar colors (None selects the ggplot2 'Set1' palette)."""
        self.colors = colors

    def setFactors(self, factors):
        """Set one normalization factor per sample name (None means 1.0 for every sample)."""
        # The command line passes None when -a is omitted; zip(names, None) used to raise TypeError.
        if factors is None:
            self.factors = dict((name, 1.0) for name in self.names)
        else:
            self.factors = dict(zip(self.names, factors))

    def setRegionsFile(self, fileName):
        """Load the regions to plot from a GFF file; None keeps the single default region."""
        if fileName is not None:
            self._loadRegions(fileName)

    def setImageSize(self, width, height):
        """Set the image size in pixels; a None value keeps the current one."""
        if width is not None:
            self.width = width
        if height is not None:
            self.height = height

    def setArial(self, arial):
        """When true, the R script loads 'extrafont' and uses the Arial family."""
        self.arial = arial

    def _checkOptions(self):
        """Raise an Exception if no input file was registered."""
        if not self.parsers:
            # The original called the non-existent 'logAndRaise', hiding the real message
            # behind an AttributeError.
            self._logAndRaise("ERROR: Missing input file names")

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at error level, then raise it as an Exception."""
        self.log.error(errorMsg)
        raise Exception(errorMsg)

    def _loadRegions(self, fileName):
        """Fill self.regions as regions[chromosome][start][end] -> list of region names."""
        self.regions = {}
        parser = GffParser(fileName, self.verbosity)
        for transcript in parser.getIterator():
            perStart = self.regions.setdefault(transcript.getChromosome(), {})
            perEnd   = perStart.setdefault(transcript.getStart(), {})
            perEnd.setdefault(transcript.getEnd(), []).append(transcript.getName())

    def _getRegions(self, transcript):
        """Return the names of the loaded regions that overlap transcript.

        Without a regions file, every transcript belongs to DEFAULT_REGION.
        """
        if self.regions is None:
            return [DEFAULT_REGION]
        chromosome = transcript.getChromosome()
        start      = transcript.getStart()
        end        = transcript.getEnd()
        if chromosome not in self.regions:
            return []
        names = []
        for loadedStart in sorted(self.regions[chromosome].keys()):
            # Starts are visited in increasing order: nothing further can overlap.
            if loadedStart > end:
                return names
            for loadedEnd in sorted(self.regions[chromosome][loadedStart].keys(), reverse = True):
                # Ends are visited in decreasing order: the remaining ones end before the transcript.
                if loadedEnd < start:
                    break
                names.extend(self.regions[chromosome][loadedStart][loadedEnd])
        return names

    def _parse(self, name):
        """Read the input of sample name and add the sizes of its reads to self.sizes.

        Only the bounds currently held in minSize/maxSize filter the reads; missing bounds
        are resolved by _computeSizeBounds() once every sample has been read.
        """
        progress = UnlimitedProgress(10000, "Reading file '%s'" % (name), self.verbosity)
        factor   = self.factors.get(name, 1)
        for transcript in self.parsers[name].getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            for region in self._getRegions(transcript):
                counts = self.sizes.setdefault(region, {}).setdefault(name, {})
                size   = transcript.getSize()
                if (self.minSize is None or size >= self.minSize) and (self.maxSize is None or size <= self.maxSize):
                    nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
                    nbElements *= factor
                    counts[size] = counts.get(size, 0) + nbElements
            progress.inc()
        progress.done()

    def _computeSizeBounds(self):
        """Replace a missing minSize/maxSize by the smallest/largest size seen in any sample.

        Nothing is changed when no size was recorded.
        """
        observed = [size for samples in self.sizes.values() for counts in samples.values() for size in counts]
        if not observed:
            return
        if self.minSize is None:
            self.minSize = min(observed)
        if self.maxSize is None:
            self.maxSize = max(observed)

    def _checkQuorum(self, region):
        """Return True if at least one sample has a positive total count in region."""
        totals = [sum(counts.values()) for counts in self.sizes[region].values()]
        return bool(totals) and max(totals) > 0

    def _writeData(self, region):
        """Write the counts of region to the temporary tab-separated file read by the R script."""
        self.tmpDatName = "tmpFile%d.dat" % (self.number)
        with open(self.tmpDatName, "w") as handle:
            handle.write("Size\tCount\tSample\n")
            for name in self.sizes[region]:
                for size in sorted(self.sizes[region][name].keys()):
                    handle.write("%d\t%d\t\"%s\"\n" % (size, self.sizes[region][name].get(size, 0), name))

    def _writeScript(self, region):
        """Write the temporary R script that plots region from the data file."""
        self.tmpRName = "tmpFile%d.R" % (self.number)
        if region == DEFAULT_REGION:
            fileName = self.outputFileName
            title    = ""
        else:
            # One image per region, named after the output file.
            fileName = "%s_%s.png" % (os.path.splitext(self.outputFileName)[0], region)
            title    = " + labs(title = \"Sizes of %s\")" % (region)
        if self.colors is None:
            colors = "scale_fill_brewer(palette=\"Set1\")"
        else:
            colors = "scale_fill_manual(values = c(%s))" % (", ".join(["\"%s\"" % (color) for color in self.colors]))
        arial = ", text = element_text(family=\"Arial\", size=20)" if self.arial else ""
        # The context manager guarantees the script is flushed and closed before R reads it.
        with open(self.tmpRName, "w") as handle:
            if self.arial:
                handle.write("library(extrafont)\nloadfonts()\n")
            handle.write("library(ggplot2)\n")
            handle.write("data <- read.table(\"%s\", header = T)\n" % (self.tmpDatName))
            handle.write("data$Sample <- factor(data$Sample, levels=c(%s))\n" % (", ".join(["\"%s\"" % (name) for name in self.names])))
            # Every size between the bounds becomes a level, so empty sizes keep their slot on the axis.
            handle.write("data$Size <- factor(data$Size, levels=c(%s))\n" % (", ".join(["%d" % (size) for size in range(self.minSize, self.maxSize+1)])))
            handle.write("png(\"%s\", width = %d, height = %d)\n" % (fileName, self.width, self.height))
            handle.write("ggplot(data, aes(x = Size, y = Count, fill = Size)) %s + geom_bar(stat = \"identity\") + facet_grid(. ~ Sample, space=\"free_x\") + xlab(\"%s\") + ylab(\"%s\") + %s + theme(legend.position = \"none\", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()%s)\n" % (title, self.xLab, self.yLab, colors, arial))
            handle.write("dev.off()\n")

    def _runR(self):
        """Run the temporary script with 'R CMD BATCH'; raise an Exception on a non-zero status.

        The R executable is taken from the SMARTRPATH environment variable when it is set.
        """
        rCommand = os.environ.get("SMARTRPATH", "R")
        # An argument list needs no shell, hence no quoting of the executable path.
        status = subprocess.call([rCommand, "CMD", "BATCH", self.tmpRName])
        if status != 0:
            raise Exception("Problem with the execution of script file %s, status is: %s" % (self.tmpRName, status))

    def _plot(self):
        """Plot every region that contains data."""
        progress = Progress(len(self.sizes), "Plotting data", self.verbosity)
        for region in self.sizes:
            if not self._checkQuorum(region):
                self.log.info("Not displaying '%s' for it contains no data." % (region))
            else:
                self._writeData(region)
                self._writeScript(region)
                self._runR()
            progress.inc()
        progress.done()

    def _cleanFiles(self):
        """Remove the temporary data and script files, and the files sharing their name as prefix."""
        for fileName in (self.tmpDatName, self.tmpRName):
            if fileName is None or not os.path.exists(fileName):
                continue
            os.remove(fileName)
            # R CMD BATCH leaves companion files next to the script (same prefix).
            for otherFileName in glob.glob("%s*" % (fileName)):
                os.remove(otherFileName)

    def run(self):
        """Check the options, read every sample, plot the regions and remove the temporary files."""
        LoggerFactory.setLevel(self.log, self.verbosity)
        self._checkOptions()
        self.log.info("START Get Read Sizes")
        for name in self.names:
            self._parse(name)
        # Resolved only after all samples are read, so the sizes seen in the first file
        # cannot filter the following ones.
        self._computeSizeBounds()
        self._plot()
        self._cleanFiles()
        self.log.info("END Get Read Sizes")


if __name__ == "__main__":
    description = "Usage: GetReadSizes.py [options]\n\nGet Read Sizes v1.0.1: Get the sizes of a set of reads. [Category: Personal]\n"
    epilog = ""
    parser = RepetOptionParser(description = description, epilog = epilog)
    parser.add_option("-i", "--input",     dest="inputFileNames",  action="store",      default=None,      type="string", help="input files, separated by commas [compulsory] [format: string]")
    parser.add_option("-f", "--format",    dest="format",          action="store",      default=None,      type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")
    parser.add_option("-n", "--names",     dest="names",           action="store",      default=None,      type="string", help="name of the input data, separated by commas [compulsory] [format: string]")
    parser.add_option("-o", "--output",    dest="outputFileName",  action="store",      default=None,      type="string", help="output file [format: output file in PNG format]")
    parser.add_option("-s", "--minSize",   dest="minSize",         action="store",      default=None,      type="int",    help="minimum size [format: int]")
    parser.add_option("-S", "--maxSize",   dest="maxSize",         action="store",      default=None,      type="int",    help="maximum size [format: int]")
    parser.add_option("-l", "--xLabel",    dest="xLab",            action="store",      default="Size",    type="string", help="x-axis label name [format: string] [default: Size]")
    parser.add_option("-L", "--yLabel",    dest="yLab",            action="store",      default="# reads", type="string", help="y-axis label name [format: string] [default: # reads]")
    parser.add_option("-c", "--colors",    dest="colors",          action="store",      default=None,      type="string", help="colors of the bars, separated by commas [format: string]")
    parser.add_option("-a", "--factors",   dest="factors",         action="store",      default=None,      type="string", help="normalization factors, separated by commas [format: string]")
    parser.add_option("-r", "--regions",   dest="regionsFileName", action="store",      default=None,      type="string", help="regions to plot [format: transcript file in GFF format]")
    parser.add_option("-z", "--width",     dest="width",           action="store",      default=800,       type="int",    help="width of the image [format: int] [default: 800]")
    parser.add_option("-Z", "--height",    dest="height",          action="store",      default=300,       type="int",    help="height of the image [format: int] [default: 300]")
    parser.add_option("-A", "--arial",     dest="arial",           action="store_true", default=False,                    help="use Arial font [format: boolean] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity",       action="store",      default=1,         type="int",    help="trace level [format: int]")
    options = parser.parse_args()[0]
    iGetReadSizes = GetReadSizes(options.verbosity)
    iGetReadSizes.setNames(options.names.split(","))
    iGetReadSizes.setInputFiles(options.inputFileNames.split(","), options.format)
    iGetReadSizes.setOutputFileName(options.outputFileName)
    iGetReadSizes.setLabs(options.xLab, options.yLab)
    iGetReadSizes.setSizes(options.minSize, options.maxSize)
    iGetReadSizes.setColors(None if options.colors is None else options.colors.split(","))
    # A real list (not a lazy map object) so the factors can be inspected or reused.
    iGetReadSizes.setFactors(None if options.factors is None else [float(factor) for factor in options.factors.split(",")])
    iGetReadSizes.setRegionsFile(options.regionsFileName)
    iGetReadSizes.setImageSize(options.width, options.height)
    iGetReadSizes.setArial(options.arial)
    iGetReadSizes.run()
-r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/GetUpDownStream.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/GetUpDownStream.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,152 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class GetUpDownStream(object):
    """Write, in GFF3, the regions flanking each element of an annotation.

    The input is first sorted and split per chromosome (FileSorter); each chromosome is
    then scanned once and the flanking intervals are sent to a Gff3Writer.
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.inputReader = None
        self.outputWriter = None
        self.nbRead = 0
        self.nbWritten = 0
        self.nbMerges = 0
        self.splittedFileNames = {}
        self.nbElementsPerChromosome = {}

    def __del__(self):
        """Remove the per-chromosome temporary files created by _sortFile()."""
        # getattr: __del__ may run on an instance whose __init__ did not complete.
        for fileName in getattr(self, "splittedFileNames", {}).values():
            try:
                os.remove(fileName)
            except OSError:
                # Best-effort cleanup: a missing file must not prevent removing the others.
                pass
        self.splittedFileNames = {}

    def setInputFile(self, fileName, format):
        """Create the parser of the input file and derive the name of the sorted pickle file."""
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "transcript")
        self.parser = parserChooser.getParser(fileName)
        self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])

    def setOutputFile(self, fileName):
        """Open the GFF3 writer on fileName."""
        self.outputWriter = Gff3Writer(fileName, self.verbosity)

    def setDistances(self, up, down):
        """Set the lengths of the upstream and downstream flanking regions."""
        self.upDistance = up
        self.downDistance = down

    def _sortFile(self):
        """Sort the input per chromosome and record the produced files and element counts."""
        fs = FileSorter(self.parser, self.verbosity-4)
        fs.perChromosome(True)
        fs.setOutputFileName(self.sortedFileName)
        fs.sort()
        self.splittedFileNames = fs.getOutputFileNames()
        self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
        self.nbRead = fs.getNbElements()

    def _write(self, start, end, reference, after):
        """Write the interval [start, end] as a flanking region of reference.

        Nothing is written when the interval is empty (start > end). The interval is named
        "up_<name>" or "down_<name>": "up" when exactly one of "reference has direction 1"
        and after is true. It is always written with direction "+".
        """
        if start > end:
            return
        transcript = Transcript()
        transcript.setChromosome(reference.getChromosome())
        transcript.setStart(start)
        transcript.setEnd(end)
        transcript.setDirection("+")
        transcript.setName("%s_%s" % ("up" if Utils.xor(reference.getDirection() == 1, after) else "down", reference.getName()))
        self.outputWriter.addTranscript(transcript)

    def _getFlanking(self, chromosome):
        """Write the flanking regions of every element of chromosome, read in sorted order."""
        progress = Progress(self.nbElementsPerChromosome[chromosome], "Analyzing chromosome %s" % (chromosome), self.verbosity)
        parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
        previous = None
        for transcript in parser.getIterator():
            progress.inc()
            transcript.removeExons()
            if previous is None:
                # First element: its left flank is only limited by the start of the chromosome.
                distance = self.upDistance if transcript.getDirection() == 1 else self.downDistance
                start = max(1, transcript.getStart() - distance)
                self._write(start, transcript.getStart()-1, transcript, False)
                previous = transcript
                continue
            if previous.include(transcript):
                continue
            if transcript.overlapWith(previous):
                # No room between overlapping elements: nothing to write between them.
                previous = transcript
                continue
            distancePrevious = self.downDistance if previous.getDirection() == 1 else self.upDistance
            distanceCurrent = self.upDistance if transcript.getDirection() == 1 else self.downDistance
            distance = transcript.getDistance(previous)
            if distancePrevious + distanceCurrent == 0:
                previous = transcript
                continue
            if distance >= distancePrevious + distanceCurrent:
                # Enough room: both flanks get their full length.
                endPrevious = previous.getEnd() + distancePrevious
                startCurrent = transcript.getStart() - distanceCurrent
            else:
                # Not enough room: the gap is shared proportionally to the two requested lengths.
                middle = previous.getEnd() + int((distance-1) * float(distancePrevious) / (distancePrevious + distanceCurrent))
                endPrevious = middle
                startCurrent = middle+1
            self._write(previous.getEnd() + 1, endPrevious, previous, True)
            self._write(startCurrent, transcript.getStart() - 1, transcript, False)
            previous = transcript
        # Right flank of the last element; skipped when the chromosome file held no element.
        if previous is not None:
            distance = self.downDistance if previous.getDirection() == 1 else self.upDistance
            self._write(previous.getEnd() + 1, previous.getEnd() + distance, previous, True)
        progress.done()

    def run(self):
        """Sort the input, process every chromosome in name order and close the output."""
        self._sortFile()
        for chromosome in sorted(self.nbElementsPerChromosome.keys()):
            self._getFlanking(chromosome)
        self.outputWriter.close()

if __name__ == "__main__":

    # parse command line
    description = "Get Up and Down Stream v1.0.0: Get the flanking regions of an annotation. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",            type="string", help="input file [compulsory] [format: file in mapping format given by -f]")
    parser.add_option("-f", "--format",    dest="format",         action="store",            type="string", help="format of the file [compulsory] [format: mapping file format]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",            type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-u", "--up",        dest="up",             action="store", default=0, type="int",    help="the upstream distance [format: int]")
    parser.add_option("-d", "--down",      dest="down",           action="store", default=0, type="int",    help="the downstream distance [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1, type="int",    help="trace level [default: 1] [format: int]")
    (options, args) = parser.parse_args()

    guds = GetUpDownStream(options.verbosity)
    guds.setInputFile(options.inputFileName, options.format)
    guds.setOutputFile(options.outputFileName)
    guds.setDistances(options.up, options.down)
    guds.run()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
REFERENCE = 0
QUERY = 1
TYPES = (REFERENCE, QUERY)
TYPETOSTRING = {0: "reference", 1: "query"}


class RestrictFromCoverage(object):
    """Select the query elements whose coverage by the reference elements is within bounds.

    For every query transcript, a per-nucleotide coverage array is built
    (one list per exon, one float per position).  A position is "covered"
    when its value passes the min/max overlap bounds; the transcript is
    written when the number and the percentage of covered positions pass
    the nucleotide and percent bounds.  A bound set to None is not checked;
    a bound set to 0 is a real bound.
    """

    def __init__(self, verbosity = 1):
        self._verbosity = verbosity
        self._randomNumber = random.randint(0, 100000)
        self._nbWritten = 0
        self._nbLines = dict((inputType, 0) for inputType in TYPES)
        self._splittedFileNames = dict((inputType, {}) for inputType in TYPES)
        self._nbElementsPerChromosome = dict((inputType, {}) for inputType in TYPES)
        self._nbElements = dict((inputType, 0) for inputType in TYPES)
        # Defaults, so that a setter which is never called means "no constraint"
        # instead of an AttributeError later on.
        self._writer = None
        self._minPercent = None
        self._maxPercent = None
        self._minNucleotides = None
        self._maxNucleotides = None
        self._minOverlap = None
        self._maxOverlap = None
        self._twoStrands = False

    def __del__(self):
        pass

    def _close(self):
        """Close the output writer, if one was set."""
        if self._writer is not None:
            self._writer.close()

    def setInputFileName(self, fileName, format, type):
        """Parse one input file and sort it into per-chromosome pickle files.

        `type` is QUERY or REFERENCE; `format` is handed to ParserChooser.
        The sorted files are named after `fileName`, a random number drawn
        at construction time and `type`.
        """
        chooser = ParserChooser(self._verbosity)
        chooser.findFormat(format)
        parser = chooser.getParser(fileName)
        sortedFileName = "%s_%d_%d_sorted.pkl" % (os.path.splitext(fileName)[0], self._randomNumber, type)
        if self._verbosity > 2:
            print("Preparing %s file..." % (TYPETOSTRING[type]))
        startTime = time.time()
        fs = FileSorter(parser, self._verbosity-1)
        fs.perChromosome(True)
        fs.setOutputFileName(sortedFileName)
        fs.sort()
        self._nbLines[type] = fs.getNbElements()
        self._splittedFileNames[type] = fs.getOutputFileNames()
        self._nbElementsPerChromosome[type] = fs.getNbElementsPerChromosome()
        self._nbElements[type] = fs.getNbElements()
        endTime = time.time()
        if self._verbosity > 2:
            print(" ...done (%ds)" % (endTime - startTime))

    def setOutputFileName(self, outputFileName):
        """Open the GFF3 writer that receives the selected transcripts."""
        self._writer = Gff3Writer(outputFileName)

    def setPercent(self, minPercent, maxPercent):
        """Set the bounds (None: unbounded) on the percentage of covered positions."""
        self._minPercent = minPercent
        self._maxPercent = maxPercent

    def setNbNucleotides(self, minNb, maxNb):
        """Set the bounds (None: unbounded) on the number of covered positions."""
        self._minNucleotides = minNb
        self._maxNucleotides = maxNb

    def setOverlap(self, minOverlap, maxOverlap):
        """Set the bounds (None: unbounded) on the per-position coverage value."""
        self._minOverlap = minOverlap
        self._maxOverlap = maxOverlap

    def setStrands(self, boolean):
        """When true, only elements with the same direction are compared."""
        self._twoStrands = boolean

    def _emptyCoverage(self, transcript):
        """Return a zero coverage array: one list per exon, one float per position."""
        return [[0.0] * exon.getSize() for exon in transcript.getExons()]

    def _compareChromosome(self, chromosome):
        """Evaluate every query transcript of one chromosome."""
        queryParser = NCListFileUnpickle(self._splittedFileNames[QUERY][chromosome], self._verbosity)
        progress = Progress(self._nbElementsPerChromosome[QUERY][chromosome], "Analyzing %s" % (chromosome), self._verbosity)
        if chromosome not in self._splittedFileNames[REFERENCE]:
            # No reference element on this chromosome: every query has a zero
            # coverage, which may still satisfy the bounds (e.g. a maximum).
            for query in queryParser.getIterator():
                if self._checkValues(self._emptyCoverage(query)):
                    self._printTranscript(query)
                progress.inc()
            progress.done()
            return
        referenceParser = NCListFileUnpickle(self._splittedFileNames[REFERENCE][chromosome], self._verbosity)
        for query in queryParser.getIterator():
            firstOverlap = self._compareList(query, referenceParser)
            # NOTE(review): _compareList returns None when the reference
            # iterator ends before any overlapping or later element is met;
            # confirm that setInitAddress accepts None.
            referenceParser.setInitAddress(firstOverlap)
            progress.inc()
        progress.done()

    def _compareList(self, transcript1, parser2):
        """Accumulate the coverage of one query and write it if it passes.

        Returns the address of the first reference element that overlaps the
        query or starts after its end, or None when there is no such element.
        """
        values = self._emptyCoverage(transcript1)
        firstOverlap = None
        for transcript2 in parser2.getIterator():
            address = parser2.getCurrentTranscriptAddress()
            tagNames = transcript2.getTagNames()
            nbElements = float(transcript2.getTagValue("nbElements")) if "nbElements" in tagNames else 1.0
            nbOccurrences = float(transcript2.getTagValue("nbOccurrences")) if "nbOccurrences" in tagNames else 1.0
            nbElements /= nbOccurrences
            if transcript2.getStart() > transcript1.getEnd():
                # This reference starts after the query end: stop scanning.
                if firstOverlap is None:
                    firstOverlap = address
                if self._checkValues(values):
                    self._printTranscript(transcript1)
                return firstOverlap
            elif transcript1.overlapWith(transcript2):
                if firstOverlap is None:
                    firstOverlap = address
                values = self._compareTranscript(transcript1, transcript2, values, nbElements)
        if self._checkValues(values):
            self._printTranscript(transcript1)
        return firstOverlap

    def _compareTranscript(self, transcript1, transcript2, values, nbElements):
        """Add the contribution of one reference transcript to `values`."""
        if not transcript1.overlapWith(transcript2) or ((self._twoStrands) and transcript1.getDirection() != transcript2.getDirection()):
            return values
        for id1, exon1 in enumerate(transcript1.getExons()):
            for exon2 in transcript2.getExons():
                contribution = self._compareExon(exon1, exon2, nbElements)
                # Build a real list (not a lazy map object) so that the
                # result can be measured and re-used on any Python version.
                values[id1] = [current + added for current, added in zip(values[id1], contribution)]
        return values

    def _compareExon(self, exon1, exon2, nbElements):
        """Return the per-position contribution of `exon2` over `exon1`."""
        array = [0.0] * exon1.getSize()
        if not exon1.overlapWith(exon2) or ((self._twoStrands) and exon1.getDirection() != exon2.getDirection()):
            return array
        offset = exon1.getStart()
        firstPos = max(exon1.getStart(), exon2.getStart()) - offset
        lastPos = min(exon1.getEnd(), exon2.getEnd()) - offset
        for pos in range(firstPos, lastPos + 1):
            array[pos] += nbElements
        return array

    def _filter(self, value):
        """Tell whether one coverage value lies within the overlap bounds.

        Bounds are compared against None, not by truthiness, so that a bound
        of 0 is enforced.
        """
        if self._minOverlap is not None and value < self._minOverlap:
            return False
        if self._maxOverlap is not None and value > self._maxOverlap:
            return False
        return True

    def _checkValues(self, values):
        """Tell whether a coverage array passes the nucleotide and percent bounds."""
        nbValues = sum(len(valuePart) for valuePart in values)
        nbPosValues = sum(1 for valuePart in values for value in valuePart if self._filter(value))
        # An element without any nucleotide is covered at 0%.
        ratio = float(nbPosValues) / nbValues * 100 if nbValues else 0.0
        if self._minNucleotides is not None and nbPosValues < self._minNucleotides:
            return False
        if self._maxNucleotides is not None and nbPosValues > self._maxNucleotides:
            return False
        if self._minPercent is not None and ratio < self._minPercent:
            return False
        if self._maxPercent is not None and ratio > self._maxPercent:
            return False
        return True

    def _printTranscript(self, transcript):
        """Write one selected transcript and count it."""
        self._writer.addTranscript(transcript)
        self._nbWritten += 1

    def run(self):
        """Process every query chromosome, close the output and print a summary."""
        for chromosome in sorted(self._splittedFileNames[QUERY].keys()):
            self._compareChromosome(chromosome)
        self._close()
        if self._verbosity > 0:
            print("# queries: %d" % (self._nbElements[QUERY]))
            print("# refs: %d" % (self._nbElements[REFERENCE]))
            print("# written: %d (%d%%)" % (self._nbWritten, 0 if self._nbElements[QUERY] == 0 else round(float(self._nbWritten) / self._nbElements[QUERY] * 100)))
default=None, type="int", help="maximum number of nucleotides overlapping to declare an overlap [format: int]") + parser.add_option("-p", "--minPercent", dest="minPercent", action="store", default=None, type="int", help="minimum percentage of nucleotides overlapping to declare an overlap [format: int]") + parser.add_option("-P", "--maxPercent", dest="maxPercent", action="store", default=None, type="int", help="maximum percentage of nucleotides overlapping to declare an overlap [format: int]") + parser.add_option("-e", "--minOverlap", dest="minOverlap", action="store", default=None, type="int", help="minimum number of elements from 2nd file to declare an overlap [format: int]") + parser.add_option("-E", "--maxOverlap", dest="maxOverlap", action="store", default=None, type="int", help="maximum number of elements from 2nd file to declare an overlap [format: int]") + parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + rfc = RestrictFromCoverage(options.verbosity) + rfc.setInputFileName(options.inputFileName1, options.format1, QUERY) + rfc.setInputFileName(options.inputFileName2, options.format2, REFERENCE) + rfc.setOutputFileName(options.output) + rfc.setNbNucleotides(options.minNucleotides, options.maxNucleotides) + rfc.setPercent(options.minPercent, options.maxPercent) + rfc.setOverlap(options.minOverlap, options.maxOverlap) + rfc.setStrands(options.strands) + rfc.run() diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/SelectByTag.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/SelectByTag.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,148 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class SelectByTag(object):
    """Write the transcripts whose tag matches a value or lies between two limits.

    The configuration (input, format, tag, value, min, max, default, output,
    mysql) is stored in public attributes that the caller sets before run().
    """

    def __init__(self, verbosity = 1):
        self.input = None
        self.format = None
        self.tag = None
        self.value = None
        self.min = None
        self.max = None
        self.default = None
        self.output = None
        self.mysql = None
        self.verbosity = verbosity

        self.parser = None
        self.writer = None
        self.mysqlWriter = None
        self.nbElements = None
        self.nbWritten = 0


    def setParser(self):
        """Open the input file and record its number of transcripts."""
        self.parser = TranscriptContainer(self.input, self.format, self.verbosity)
        self.nbElements = self.parser.getNbTranscripts()


    def setWriter(self):
        """Create the GFF3 writer and, when requested, the MySQL writer."""
        self.writer = Gff3Writer(self.output, self.verbosity)
        if self.mysql:
            # The name is imported with "from commons.core.writer import ...",
            # unlike Gff3Writer, so it may be bound to the module rather than
            # to the class: resolve the class of the same name when present.
            writerClass = getattr(MySqlTranscriptWriter, "MySqlTranscriptWriter", MySqlTranscriptWriter)
            self.mysqlWriter = writerClass(self.output, self.verbosity)


    def isAccepted(self, transcript):
        """Tell whether a transcript passes the tag criterion.

        When `self.value` is set, the tag must equal it, either as a string
        or numerically; otherwise the tag, converted to float, must lie
        within [self.min, self.max] (a None limit is not checked).

        Raises:
            Exception: the transcript lacks the tag and no default is set.
        """
        value = transcript.getTagValue(self.tag)
        if value is None:
            if self.default is None:
                raise Exception("Error! Transcript %s no tag called '%s'" % (transcript, self.tag))
            value = self.default
        if self.value is not None:
            if self.value == str(value):
                return True
            # Numeric comparison: also covers negative and decimal numbers,
            # and tag values stored as strings ("3.0" matches "3").
            try:
                return float(value) == float(self.value)
            except (TypeError, ValueError):
                return False
        value = float(value)
        return (self.min is None or self.min <= value) and (self.max is None or self.max >= value)


    def readInputFile(self):
        """Send every accepted transcript to the writer(s)."""
        progress = Progress(self.parser.getNbTranscripts(), "Writing transcripts", self.verbosity)
        for transcript in self.parser.getIterator():
            if self.isAccepted(transcript):
                self.writer.addTranscript(transcript)
                if self.mysql:
                    self.mysqlWriter.addTranscript(transcript)
                self.nbWritten += 1
            progress.inc()
        progress.done()


    def writeFile(self):
        """Flush the writer(s)."""
        self.writer.write()
        if self.mysql:
            self.mysqlWriter.write()


    def run(self):
        """Read, filter and write the transcripts, then print a summary."""
        self.setParser()
        self.setWriter()
        self.readInputFile()
        self.writeFile()
        if self.verbosity > 0:
            print("%d input" % (self.nbElements))
            if self.nbElements != 0:
                print("%d output (%.2f%%)" % (self.nbWritten, float(self.nbWritten) / self.nbElements * 100))
[Category: Data Selection]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]") + parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="the tag [compulsory] [format: string]") + parser.add_option("-a", "--value", dest="value", action="store", default=None, type="string", help="the value to be found [format: string]") + parser.add_option("-m", "--min", dest="min", action="store", default=None, type="float", help="the minimum threshold [format: float]") + parser.add_option("-M", "--max", dest="max", action="store", default=None, type="float", help="the maximum threshold [format: float]") + parser.add_option("-d", "--default", dest="default", action="store", default=None, type="float", help="value if tag is not present [format: float]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") + parser.add_option("-y", "--mysql", dest="mysql", action="store_true", default=False, help="write output into MySQL tables [format: boolean] [default: False]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + selectByTag = SelectByTag(options.verbosity) + selectByTag.input = options.inputFileName + selectByTag.format = options.format + selectByTag.tag = options.tag + selectByTag.value = options.value + selectByTag.min = options.min + selectByTag.max = options.max + selectByTag.default = options.default + selectByTag.output = options.outputFileName + selectByTag.mysql = options.mysql + 
def toTar(tarFileName, directory):
    """Archive the entries of `directory` whose name contains the output base name.

    The base name is `tarFileName` without its directory and extension.  The
    archive is written under a temporary ".tmp.tar" name next to the
    requested output, closed, and only then moved to `tarFileName`, so that
    the final file is always complete.

    Args:
        tarFileName: path of the tar archive to produce.
        directory: directory whose entries are scanned (not recursively
            filtered: a matching sub-directory is added as a whole).
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarFileName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarFileName, "w")
    try:
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # The temporary archive may live in the scanned directory and
            # its name contains the base name: never archive it into itself.
            if os.path.abspath(path) == os.path.abspath(tmpTarFileName):
                continue
            # Plain substring test: the base name is a file name, not a regex.
            if fileNameBaseName in entry:
                # Use the full path so that the current directory does not
                # matter, and store the entry under its bare name.
                tfile.add(path, arcname=entry)
    finally:
        tfile.close()
    shutil.move(tmpTarFileName, tarFileName)
[Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outTarFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [compulsory] [format: file in FASTA format]") + parser.add_option("-n", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") + parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") + parser.add_option("-w", "--raw", dest="raw", action="store_true", default=False, help="plot raw number of occurrences instead of density [format: bool] [default: false]") + parser.add_option("-x", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") + parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") + parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") + parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") + parser.add_option("-Y", "--yMax", 
dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") + parser.add_option("-g", "--gff", dest="gff", action="store_true", default=False, help="also write GFF3 file [format: bool] [default: false]") + parser.add_option("-H", "--height", dest="height", action="store", default=None, type="int", help="height of the graphics [format: int] [default: 300]") + parser.add_option("-W", "--width", dest="width", action="store", default=None, type="int", help="width of the graphics [format: int] [default: 1000]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool]") + (options, args) = parser.parse_args() + + + absPath = os.getcwd() + print "the current path is :", absPath + directory = "/tmp/wrappGetDistribution" + print "the dir path is :", directory + if not os.path.exists(directory): + os.makedirs(directory) + os.chdir(directory) + if options.inputFileName != None and options.format != None and options.outTarFileName != None: + outputFileName = os.path.splitext(os.path.basename(options.outTarFileName))[0] + cmd = "python %s/Java/Python/getDistribution.py -i %s -f %s -o %s -D %s" % (SMART_PATH, options.inputFileName, options.format, outputFileName, directory) + if options.referenceFileName != None : + cmd += " -r %s" % options.referenceFileName + if options.nbBins != None : + cmd += " -n %s" % options.nbBins + if options.chromosome : + cmd += " -c %s" % options.chromosome + if options.start != None : + cmd += " -s %s" % options.start + if options.end != None : + cmd += " -e %s" % options.end + if options.yMin != None : + cmd += " -y %s" % options.yMin + if options.yMax != None : + cmd += " -Y %s" % options.yMax + if options.height != None : + cmd += " -H %s" % options.height + if options.width != None : + 
def toTar(tarFileName, directory):
    """Archive the entries of `directory` whose name contains the output base name.

    The base name is `tarFileName` without its directory and extension.  The
    archive is written under a temporary ".tmp.tar" name next to the
    requested output, closed, and only then moved to `tarFileName`, so that
    the final file is always complete.

    Args:
        tarFileName: path of the tar archive to produce.
        directory: directory whose entries are scanned (a matching
            sub-directory is added as a whole).
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarFileName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarFileName, "w")
    try:
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # The temporary archive may live in the scanned directory and
            # its name contains the base name: never archive it into itself.
            if os.path.abspath(path) == os.path.abspath(tmpTarFileName):
                continue
            # Plain substring test: the base name is a file name, not a regex.
            if fileNameBaseName in entry:
                # Use the full path so that the current directory does not
                # matter, and store the entry under its bare name.
                tfile.add(path, arcname=entry)
    finally:
        tfile.close()
    shutil.move(tmpTarFileName, tarFileName)
[Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file sequence [compulsory] [format: file in sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: sequence file format]") + parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="keep the best n [format: int]") + parser.add_option("-p", "--percent", dest="percent", action="store", default=None, type="float", help="keep the best n\% [format: float]") + parser.add_option("-o", "--output", dest="outTarFileName", action="store", type="string", help="output file [compulsory] [format: zip]") + + (options, args) = parser.parse_args() + + + absPath = os.getcwd() + print "the current path is :", absPath + directory = "/tmp/wrappGetReadDistribution" + print "the dir path is :", directory + if not os.path.exists(directory): + os.makedirs(directory) + os.chdir(directory) + if options.inputFileName != None and options.format != None and options.outTarFileName != None: + outputFileName = os.path.splitext(os.path.basename(options.outTarFileName))[0] + cmd = "python %s/Java/Python/getReadDistribution.py -i %s -f %s -o %s -D %s" % (SMART_PATH, options.inputFileName, options.format, outputFileName, directory) + if options.number != None : + cmd += " -n %s" % options.number + if options.percent != None : + cmd += " -p %s" % options.percent + print "cmd is: ", cmd + status = subprocess.call(cmd, shell=True) + if status != 0: + raise Exception("Problem with the execution of command!") + toTar(options.outTarFileName, directory) + shutil.rmtree(directory) + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/WrappPlotCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/WrappPlotCoverage.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 
def toTar(tarFileName, directory):
    """Archive the entries of `directory` whose name contains the output base name.

    The base name is `tarFileName` without its directory and extension.  The
    archive is written under a temporary ".tmp.tar" name next to the
    requested output, closed, and only then moved to `tarFileName`, so that
    the final file is always complete.

    Args:
        tarFileName: path of the tar archive to produce.
        directory: directory whose entries are scanned (a matching
            sub-directory is added as a whole).
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarFileName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarFileName, "w")
    try:
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # The temporary archive may live in the scanned directory and
            # its name contains the base name: never archive it into itself.
            if os.path.abspath(path) == os.path.abspath(tmpTarFileName):
                continue
            # Plain substring test: the base name is a file name, not a regex.
            if fileNameBaseName in entry:
                # Use the full path so that the current directory does not
                # matter, and store the entry under its bare name.
                tfile.add(path, arcname=entry)
    finally:
        tfile.close()
    shutil.move(tmpTarFileName, tarFileName)
int] [default: 1500]") + parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]") + parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]") + parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]") + parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]") + parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]") + parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]") + parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]") + parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]") + parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + absPath = os.getcwd() + directory = "/tmp/wrappPlotCov" + if not os.path.exists(directory): + os.makedirs(directory) + os.chdir(directory) + if options.inputFileName1 != None and options.inputFormat1 != None and options.inputFileName2 != None and options.inputFormat2 != None and options.outTarFileName != None: + outputFileName = 
os.path.splitext(os.path.basename(options.outTarFileName))[0] + print 'outputfile is :', outputFileName + cmd = "python %s/Java/Python/plotCoverage.py -i %s -f %s -j %s -g %s -o %s -D %s" % (SMART_PATH, options.inputFileName1, options.inputFormat1, options.inputFileName2, options.inputFormat2, outputFileName, directory) + if options.inputSequence!= None: + cmd += " -q %s" % options.inputSequence + if options.width != None: + cmd += " -w %s" % options.width + if options.height != None: + cmd += " -e %s" % options.height + if options.title != None: + cmd += " -t %s" % options.title + if options.xLabel != None: + cmd += " -x %s" % options.xLabel + if options.yLabel != None: + cmd += " -y %s" % options.yLabel + if options.plusColor != None: + cmd += " -p %s" % options.plusColor + if options.minusColor != None: + cmd += " -m %s" % options.minusColor + if options.sumColor != None: + cmd += " -s %s" % options.sumColor + if options.lineColor != None: + cmd += " -l %s" % options.lineColor + if options.merge: + cmd += " -1" + status = subprocess.call(cmd, shell=True) + if status != 0: + raise Exception("Problem with the execution of command!") + toTar(options.outTarFileName, directory) + shutil.rmtree(directory) + + + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/WrappPlotRepartition.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/WrappPlotRepartition.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,71 @@ +#! 
def toTar(tarFileName, directory):
    """Archive the entries of `directory` whose name contains the output base name.

    The base name is `tarFileName` without its directory and extension.  The
    archive is written under a temporary ".tmp.tar" name next to the
    requested output, closed, and only then moved to `tarFileName`, so that
    the final file is always complete.

    Args:
        tarFileName: path of the tar archive to produce.
        directory: directory whose entries are scanned (a matching
            sub-directory is added as a whole).
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarFileName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarFileName, "w")
    try:
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # The temporary archive may live in the scanned directory and
            # its name contains the base name: never archive it into itself.
            if os.path.abspath(path) == os.path.abspath(tmpTarFileName):
                continue
            # Plain substring test: the base name is a file name, not a regex.
            if fileNameBaseName in entry:
                # Use the full path so that the current directory does not
                # matter, and store the entry under its bare name.
                tfile.add(path, arcname=entry)
    finally:
        tfile.close()
    shutil.move(tmpTarFileName, tarFileName)
"--log",dest="log",action="store",default="",type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string]") + parser.add_option("-v", "--verbosity",dest="verbosity",action="store",default=1,type="int",help="trace level [format: int]") + (options, args) = parser.parse_args() + + + absPath = os.getcwd() + print "the current path is :", absPath + directory = "/tmp/wrappPlotRepartition" + print "the dir path is :", directory + if not os.path.exists(directory): + os.makedirs(directory) + os.chdir(directory) + if options.inputFileName != None and options.format != None and options.outTarFileName != None: + outputFileName = os.path.splitext(os.path.basename(options.outTarFileName))[0] + cmd = "python %s/Java/Python/plotRepartition.py -i %s -o %s -D %s" % (SMART_PATH, options.inputFileName, outputFileName, directory) + if options.names != None : + cmd += " -n %s" % options.names + else: print "You must choose tag names !" + if options.colors != None : + cmd += " -c %s" % options.colors + if options.format != None: + cmd += " -f %s" % options.format + if options.normalize : + cmd += " -r " + if options.log != "" : + cmd += " -l %s" % options.log + + print "cmd is: ", cmd + status = subprocess.call(cmd, shell=True) + if status != 0: + raise Exception("Problem with the execution of command!") + toTar(options.outTarFileName, directory) + shutil.rmtree(directory) + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/adaptorStripper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/adaptorStripper.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,115 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +"""Remove adaptors""" + +import os +from optparse import OptionParser +from SMART.Java.Python.structure.Sequence import Sequence +from SMART.Java.Python.structure.SequenceList import SequenceList +from commons.core.parsing.FastaParser import FastaParser +from commons.core.writer.FastaWriter import FastaWriter +from SMART.Java.Python.misc.Progress import Progress + + +def distance (string1, string2): + if len(string1) != len(string2): + return None + distance = 0 + for i in range(0, len(string1)): + if string1[i] != string2[i]: + distance += 1 + return distance + + + +if __name__ == "__main__": + nbRemaining = 0 + + # parse command line + description = "Adaptor Stripper v1.0.1: Remove the adaptor of a list of reads. [Category: Personnal]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in FASTA format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]") + parser.add_option("-5", "--5primeAdaptor", dest="fivePrimeAdaptor", action="store", type="string", help="five prime adaptor [format: string]") + parser.add_option("-3", "--3primeAdaptor", dest="threePrimeAdaptor", action="store", type="string", help="three prime adaptor [format: string]") + parser.add_option("-d", "--5primeDist", dest="fivePrimeDistance", action="store", default=3, type="int", help="five prime distance [format: int] [default: 3]") + parser.add_option("-e", "--3primeDist", dest="threePrimeDistance", action="store", default=3, type="int", help="three prime distance [format: int [default: 3]]") + parser.add_option("-m", "--3primeSize", dest="threePrimeSize", action="store", default=10, type="int", help="three prime size [format: int] [default: 10]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", 
help="trace level [format: int] [default: 1]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]") + (options, args) = parser.parse_args() + + if options.log: + logHandle = open(options.outputFileName + ".log", "w") + + + writer = FastaWriter(options.outputFileName + ".fas", options.verbosity) + sequenceParser = FastaParser(options.inputFileName, options.verbosity) + nbSequences = sequenceParser.getNbSequences() + + # treat sequences + progress = Progress(sequenceParser.getNbSequences(), "Analyzing " + options.inputFileName, options.verbosity) + for sequence in sequenceParser.getIterator(): + fivePrimeAdaptor = sequence.getSequence()[0:len(options.fivePrimeAdaptor)] + threePrimeAdaptor = sequence.getSequence()[len(sequence.sequence)-len(options.threePrimeAdaptor):] + + # check 5' adaptor + fivePrimeDistance = distance(fivePrimeAdaptor, options.fivePrimeAdaptor) + # check 3' adaptor + threePrimeDistance = len(threePrimeAdaptor) + for i in range(options.threePrimeSize, len(threePrimeAdaptor)+1): + threePrimeDistance = min(threePrimeDistance, distance(threePrimeAdaptor[-i:], options.threePrimeAdaptor[:i])) + + # sort candidates + if fivePrimeDistance > options.fivePrimeDistance: + if options.log: + logHandle.write("Sequence %s does not start with the right adaptor (%s != %s)\n" % (sequence.getSequence(), fivePrimeAdaptor, options.fivePrimeAdaptor)) + elif threePrimeDistance > options.threePrimeDistance: + if options.log: + logHandle.write("Sequence %s does not end with the right adaptor (%s != %s)\n" % (sequence.getSequence(), threePrimeAdaptor, options.threePrimeAdaptor)) + else: + nbRemaining += 1 + sequence.setSequence(sequence.getSequence()[len(options.fivePrimeAdaptor):len(sequence.getSequence())-len(options.threePrimeAdaptor)]) + writer.addSequence(sequence) + + progress.inc() + + progress.done() + + if options.log: + logHandle.close() + + writer.write() + + print "kept %i 
over %i (%.f%%)" % (nbRemaining, nbSequences, float(nbRemaining) / nbSequences * 100) + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/changeGffFeatures.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/changeGffFeatures.sh Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,2 @@ +#!/bin/bash +sed "s/\t$2\t/\t$3\t/g" $1 diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/changeTagName.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/changeTagName.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,90 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
if __name__ == "__main__":
    # only needed by this script entry point
    import shutil

    # parse command line
    description = "Change Tag Name v1.0.1: Change the name of tag of a list of transcripts. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",       dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat",    action="store",                     type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output",      dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-t", "--tag",         dest="tag",            action="store",                     type="string", help="name of the tag to change [compulsory] [format: string]")
    parser.add_option("-n", "--name",        dest="name",           action="store",                     type="string", help="new name for the tag [compulsory] [format: string]")
    parser.add_option("-y", "--mysql",       dest="mysql",          action="store_true", default=False,                help="mySQL output [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int] [default: 1]")
    parser.add_option("-l", "--log",         dest="log",            action="store_true", default=False,                help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    if options.log:
        # nothing is written to the log by this script: the (empty) file is
        # still created as before, but its handle is no longer left open
        open("%s.log" % options.outputFileName, "w").close()

    # create parser and writer(s)
    transcriptContainer = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity)
    # the result goes to a temporary file first, and is moved to the output name at the very end
    tmpFileName         = "tmpTranscriptFile%d.gff3" % (random.randint(0, 100000))
    writer              = Gff3Writer(tmpFileName, options.verbosity)
    mysqlWriter         = None
    if options.mysql:
        mysqlWriter = MySqlTranscriptWriter(options.outputFileName, options.verbosity)

    # process transcripts
    progress = Progress(transcriptContainer.getNbTranscripts(), "Printing transcripts %s" % (options.inputFileName), options.verbosity)
    for transcript in transcriptContainer.getIterator():
        if options.tag in transcript.tags:
            value = transcript.tags[options.tag]
            del transcript.tags[options.tag]
            transcript.tags[options.name] = value
        writer.addTranscript(transcript)
        if mysqlWriter is not None:
            mysqlWriter.addTranscript(transcript)
        progress.inc()
    progress.done()
    transcriptContainer.transcriptListParser.close()

    writer.write()

    if mysqlWriter is not None:
        mysqlWriter.write()

    # shutil.move also works when the output is on another file system than the current directory
    shutil.move(tmpFileName, options.outputFileName)
# number of ID-less children already seen, per parent and per feature type
count = {}

class ParsedLine(object):
    """
    A feature line of a GFF file
    @ivar line:          the line, without leading/trailing blanks
    @ivar cpt:           the number of the line in the file
    @ivar splittedLine:  the 9 fields of the line
    @ivar type:          the third field (feature type)
    @ivar parsedOptions: the attributes of the ninth field
    @ivar id:            the ID of the feature (read or generated)
    @ivar parents:       the list of the parent IDs, or None
    """

    def __init__(self, line, cpt):
        self.line = line
        self.cpt  = cpt
        self.parse()

    def parse(self):
        """
        Split the line and extract type, attributes, ID and parents
        """
        self.line         = self.line.strip()
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields.  Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()
        self.getId()
        self.getParents()

    def parseOptions(self):
        """
        Parse the ninth field, accepting 'key=value', 'key value' and bare values (taken as the ID)
        """
        self.parsedOptions = {}
        for option in self.splittedLine[8].split(";"):
            option = option.strip()
            if option == "": continue
            posSpace = option.find(" ")
            posEqual = option.find("=")
            if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
                key, value = option.split("=", 1)
            elif posSpace != -1:
                key, value = option.split(None, 1)
            else:
                key   = "ID"
                value = option
            self.parsedOptions[key.strip()] = value.strip(" \"")

    def getId(self):
        """
        Read the ID of the feature, or build one from the first parent (or from the line number)
        """
        for key in self.parsedOptions:
            if key.lower() == "id":
                self.id = self.parsedOptions[key]
                return
        if "Parent" in self.parsedOptions:
            parent = self.parsedOptions["Parent"].split(",")[0]
            typeCounts = count.setdefault(parent, {})
            typeCounts[self.type] = typeCounts.get(self.type, 0) + 1
            self.id = "%s-%s-%d" % (parent, self.type, typeCounts[self.type])
        else:
            self.id = "smart%d" % (self.cpt)
        self.parsedOptions["ID"] = self.id

    def getParents(self):
        """
        Read the parents from the first 'Parent' or 'Derives_from' attribute found
        """
        for key in self.parsedOptions:
            if key.lower() in ("parent", "derives_from"):
                self.parents = self.parsedOptions[key].split(",")
                return
        self.parents = None

    def removeParent(self):
        """
        Remove the 'Parent' and 'Derives_from' attributes
        """
        # iterate over a copy of the keys: the dictionary is modified in the loop
        for key in list(self.parsedOptions):
            if key.lower() in ("parent", "derives_from"):
                del self.parsedOptions[key]

    def export(self):
        """
        @return: the line in GFF3 format (tab-separated, 'key=value' attributes)
        """
        self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.items()])
        return "%s\n" % ("\t".join(self.splittedLine))


class CleanGff(object):
    """
    Read a GFF file, keep some feature types, and write each top-level feature followed by its children
    @ivar lines:         the kept lines, indexed by ID
    @ivar acceptedTypes: the feature types that are kept
    @ivar parents:       the lines without any known parent
    @ivar children:      the lines of the children, indexed by the ID of the parent
    """

    def __init__(self, verbosity = 1):
        self.verbosity     = verbosity
        self.lines         = {}
        self.acceptedTypes = []
        self.parents       = []
        self.children      = {}

    def setInputFileName(self, name):
        self.inputFile = open(name)

    def setOutputFileName(self, name):
        self.outputFile = open(name, "w")

    def setAcceptedTypes(self, types):
        self.acceptedTypes = types

    def parse(self):
        """
        Read the input file up to its end or to the first FASTA header, then close it
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        try:
            for cpt, line in enumerate(self.inputFile):
                stripped = line.strip()
                # blank lines and comments carry no feature
                if not stripped or stripped[0] == "#": continue
                # a '>' starts the sequence section
                if stripped[0] == ">": break
                parsedLine = ParsedLine(line, cpt)
                if parsedLine.type in self.acceptedTypes:
                    self.lines[parsedLine.id] = parsedLine
                progress.inc()
        finally:
            self.inputFile.close()
        progress.done()

    def sort(self):
        """
        Attach each line to its known parents; lines with no known parent become top-level lines
        """
        progress = Progress(len(self.lines), "Sorting file", self.verbosity)
        for line in self.lines.values():
            parentFound = False
            if line.parents:
                for parent in line.parents:
                    if parent in self.lines:
                        parentFound = True
                        self.children.setdefault(parent, []).append(line)
            if not parentFound:
                line.removeParent()
                self.parents.append(line)
            progress.inc()
        progress.done()

    def write(self):
        """
        Write the top-level lines with their descendants, and close the output file
        """
        progress = Progress(len(self.parents), "Writing output file", self.verbosity)
        try:
            for line in self.parents:
                self.writeLine(line)
                progress.inc()
        finally:
            self.outputFile.close()
        progress.done()

    def writeLine(self, line):
        """
        Write a line, then (recursively) its children
        """
        self.outputFile.write(line.export())
        for child in self.children.get(line.id, []):
            self.writeLine(child)

    def run(self):
        self.parse()
        self.sort()
        self.write()


if __name__ == "__main__":

    # parse command line
    description = "Clean GFF v1.0.3: Clean a GFF file (as given by NCBI) and outputs a GFF3 file. [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                      type="string", help="input file name [compulsory] [format: file in GFF format]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",                      type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-t", "--types",     dest="types",          action="store", default="mRNA,exon", type="string", help="list of comma-separated types that you want to keep [format: string] [default: mRNA,exon]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,           type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    cleanGff = CleanGff(options.verbosity)
    cleanGff.setInputFileName(options.inputFileName)
    cleanGff.setOutputFileName(options.outputFileName)
    cleanGff.setAcceptedTypes(options.types.split(","))
    cleanGff.run()
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import sys +from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner +from SMART.Java.Python.cleaning.GffCleaner import GffCleaner +from SMART.Java.Python.cleaning.GtfCleaner import GtfCleaner +from SMART.Java.Python.cleaning.DefaultCleaner import DefaultCleaner + +#Attention!! Do not delete the imports!! They are used to know the type of file format!!! 
class CleanerChooser(object):
    """
    Pick the cleaner class that declares a given file format
    @ivar cleanerClass: the class selected by the last call to findFormat
    @type cleanerClass: class
    @ivar verbosity:    verbosity handed over to the cleaner
    @type verbosity:    int
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity = verbosity


    def findFormat(self, format):
        """
        Select the cleaner class of a format
        The direct subclasses of TranscriptListCleaner are scanned, and the
        first one that lists the format is kept; DefaultCleaner is used
        if there is none.
        @param format: the format
        @type  format: string
        """
        for candidate in TranscriptListCleaner.__subclasses__():
            if candidate is None:
                continue
            handledFormats = candidate.getFileFormats()
            if handledFormats is not None and format in handledFormats:
                self.cleanerClass = candidate
                return
        self.cleanerClass = DefaultCleaner


    def getCleaner(self):
        """
        Instantiate the class selected by findFormat
        @return: a cleaner, built with the verbosity of the chooser
        """
        selectedClass = self.cleanerClass
        return selectedClass(self.verbosity)
class DefaultCleaner(TranscriptListCleaner):
    """
    The cleaner used when no specific cleaner handles the format: the input is copied to the output, unchanged
    """

    def __init__(self, verbosity = 1):
        super(DefaultCleaner, self).__init__(verbosity)

    def _clean(self):
        """
        Copy the input handle to the output handle
        """
        import shutil
        # copied chunk by chunk: the whole file is never held in memory
        shutil.copyfileobj(self.inputHandle, self.outputHandle)
# number of ID-less children already seen, per parent and per feature type
count = {}

class ParsedLine(object):
    """
    A feature line of a GFF file
    @ivar line:          the line, without leading/trailing blanks
    @ivar cpt:           the number of the line in the file
    @ivar splittedLine:  the 9 fields of the line
    @ivar type:          the third field (feature type)
    @ivar parsedOptions: the attributes of the ninth field
    @ivar id:            the ID of the feature (read or generated)
    @ivar parents:       the list of the parent IDs, or None
    """

    def __init__(self, line, cpt):
        self.line = line
        self.cpt  = cpt
        self.parse()

    def parse(self):
        """
        Split the line and extract type, attributes, ID and parents
        """
        self.line         = self.line.strip()
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields.  Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()
        self.getId()
        self.getParents()

    def parseOptions(self):
        """
        Parse the ninth field, accepting 'key=value', 'key value' and bare values (taken as the ID)
        """
        self.parsedOptions = {}
        for option in self.splittedLine[8].split(";"):
            option = option.strip()
            if option == "": continue
            posSpace = option.find(" ")
            posEqual = option.find("=")
            if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
                key, value = option.split("=", 1)
            elif posSpace != -1:
                key, value = option.split(None, 1)
            else:
                key   = "ID"
                value = option
            self.parsedOptions[key.strip()] = value.strip(" \"")

    def getId(self):
        """
        Read the ID of the feature, or build one from the first parent (or from the line number)
        """
        for key in self.parsedOptions:
            if key.lower() == "id":
                self.id = self.parsedOptions[key]
                return
        if "Parent" in self.parsedOptions:
            parent = self.parsedOptions["Parent"].split(",")[0]
            typeCounts = count.setdefault(parent, {})
            typeCounts[self.type] = typeCounts.get(self.type, 0) + 1
            self.id = "%s-%s-%d" % (parent, self.type, typeCounts[self.type])
        else:
            self.id = "smart%d" % (self.cpt)
        self.parsedOptions["ID"] = self.id

    def getParents(self):
        """
        Read the parents from the first 'Parent' or 'Derives_from' attribute found
        """
        for key in self.parsedOptions:
            if key.lower() in ("parent", "derives_from"):
                self.parents = self.parsedOptions[key].split(",")
                return
        self.parents = None

    def removeParent(self):
        """
        Remove the 'Parent' and 'Derives_from' attributes
        """
        # iterate over a copy of the keys: the dictionary is modified in the loop
        for key in list(self.parsedOptions):
            if key.lower() in ("parent", "derives_from"):
                del self.parsedOptions[key]

    def export(self):
        """
        @return: the line in GFF3 format (tab-separated, 'key=value' attributes)
        """
        self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.items()])
        return "%s\n" % ("\t".join(self.splittedLine))


class GffCleaner(TranscriptListCleaner):
    """
    Cleaner for GFF files: keeps some feature types and writes each top-level feature followed by its children
    @ivar lines:         the kept lines, indexed by ID
    @ivar acceptedTypes: the feature types that are kept (None: keep everything)
    @ivar parents:       the lines without any known parent
    @ivar children:      the lines of the children, indexed by the ID of the parent
    """

    def __init__(self, verbosity = 1):
        super(GffCleaner, self).__init__(verbosity)
        self.lines         = {}
        self.acceptedTypes = ["mRNA", "transcript", "exon"]
        self.parents       = []
        self.children      = {}

    def getFileFormats():
        return ["gff", "gff2", "gff3"]
    getFileFormats = staticmethod(getFileFormats)

    def setAcceptedTypes(self, types):
        self.acceptedTypes = types

    def parse(self):
        """
        Read the input handle up to its end or to the first FASTA header
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        for cpt, line in enumerate(self.inputHandle):
            stripped = line.strip()
            # blank lines and comments carry no feature
            if not stripped or stripped[0] == "#": continue
            # a '>' starts the sequence section
            if stripped[0] == ">": break
            parsedLine = ParsedLine(line, cpt)
            if self.acceptedTypes is None or parsedLine.type in self.acceptedTypes:
                self.lines[parsedLine.id] = parsedLine
            progress.inc()
        progress.done()

    def sort(self):
        """
        Attach each line to its known parents; lines with no known parent become top-level lines
        """
        progress = Progress(len(self.lines), "Sorting file", self.verbosity)
        for line in self.lines.values():
            parentFound = False
            if line.parents:
                for parent in line.parents:
                    if parent in self.lines:
                        parentFound = True
                        self.children.setdefault(parent, []).append(line)
            if not parentFound:
                line.removeParent()
                self.parents.append(line)
            progress.inc()
        progress.done()

    def write(self):
        """
        Write the top-level lines with their descendants
        """
        progress = Progress(len(self.parents), "Writing output file", self.verbosity)
        for line in self.parents:
            self.writeLine(line)
            progress.inc()
        progress.done()

    def writeLine(self, line):
        """
        Write a line, then (recursively) its children
        """
        self.outputHandle.write(line.export())
        for child in self.children.get(line.id, []):
            self.writeLine(child)

    def _clean(self):
        self.parse()
        self.sort()
        self.write()
count = {}

class ParsedLine(object):
    """
    A feature line of a GTF file
    @ivar line:          the line, without leading/trailing blanks
    @ivar cpt:           the number of the line in the file
    @ivar splittedLine:  the 9 fields of the line
    @ivar type:          the third field (feature type)
    @ivar parsedOptions: the attributes of the ninth field (values are kept quoted)
    @ivar transcriptId:  the value of the 'transcript_id' attribute, or None
    """

    def __init__(self, line, cpt):
        self.line = line
        self.cpt  = cpt
        self.parse()

    def parse(self):
        """
        Split the line and extract type and attributes
        """
        self.line         = self.line.strip()
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields.  Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()

    def parseOptions(self):
        """
        Parse the ninth field: a list of 'key value [value...];' items
        """
        self.parsedOptions = {}
        self.transcriptId  = None
        key    = None
        values = []
        for option in shlex.split(self.splittedLine[8]):
            option = option.strip()
            if option == "": continue
            if key is None:
                key = option
                continue
            endValue = (option[-1] == ";")
            if endValue:
                # the separator is not part of the value
                option = option.rstrip(";")
            if option != "":
                values.append("\"%s\"" % (option))
            if endValue:
                self._storeOption(key, values)
                key    = None
                values = []
        # the last attribute may lack its final ';'
        if key is not None and values:
            self._storeOption(key, values)

    def _storeOption(self, key, values):
        """
        Register an attribute, and remember the transcript id if this is it
        """
        value = " ".join(values)
        self.parsedOptions[key] = value
        if key == "transcript_id":
            self.transcriptId = value

    def export(self):
        """
        @return: the line, as it was read
        """
        return "%s\n" % (self.line)


class GtfCleaner(TranscriptListCleaner):
    """
    Cleaner for GTF files: keeps some feature types and groups the lines by transcript
    @ivar acceptedTypes: the feature types that are kept (None: keep everything)
    @ivar parents:       the kept lines, indexed by transcript id
    """

    def __init__(self, verbosity = 1):
        super(GtfCleaner, self).__init__(verbosity)
        self.acceptedTypes = ["exon"]
        self.parents       = {}

    def getFileFormats():
        return ["gtf"]
    getFileFormats = staticmethod(getFileFormats)

    def setAcceptedTypes(self, types):
        self.acceptedTypes = types

    def parse(self):
        """
        Read the input handle and group the accepted lines by transcript id
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        for cpt, line in enumerate(self.inputHandle):
            stripped = line.strip()
            # blank lines and comments carry no feature
            if not stripped or stripped[0] == "#": continue
            parsedLine = ParsedLine(line, cpt)
            if self.acceptedTypes is None or parsedLine.type in self.acceptedTypes:
                transcriptId = parsedLine.transcriptId
                if transcriptId is None:
                    raise Exception("Line '%s' has no transcript_id.  Exiting..." % (parsedLine.line))
                self.parents.setdefault(transcriptId, []).append(parsedLine)
            progress.inc()
        progress.done()

    def write(self):
        """
        Write the lines, transcript after transcript (sorted by transcript id)
        """
        progress = Progress(len(self.parents), "Writing output file", self.verbosity)
        for parent in sorted(self.parents):
            for line in self.parents[parent]:
                self.outputHandle.write(line.export())
            # one step per transcript, as the progress is sized with the number of transcripts
            progress.inc()
        progress.done()

    def _clean(self):
        self.parse()
        self.write()
May 02 09:56:47 2013 -0400 @@ -0,0 +1,63 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class TranscriptListCleaner(object):
    """A (quite generic) class that cleans a file containing transcripts.

    Subclasses implement _clean(), reading from self.inputHandle and writing
    to self.outputHandle; clean() runs it and then closes both handles.
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        # Defined up front so that close() is safe before the setters run.
        self.inputHandle = None
        self.outputHandle = None

    def setInputFileName(self, fileName):
        """Open fileName for reading.

        Raises:
            Exception: if the file cannot be opened.
        """
        try:
            self.inputHandle = open(fileName)
        except IOError:
            # Use the argument: the instance has no 'fileName' attribute.
            raise Exception("Error! Transcript file '%s' does not exist! Exiting..." % (fileName))

    def setOutputFileName(self, fileName):
        """Open fileName for writing.

        Raises:
            Exception: if the file cannot be opened for writing.
        """
        try:
            self.outputHandle = open(fileName, "w")
        except IOError:
            raise Exception("Error! Cannot write to output file '%s'! Exiting..." % (fileName))

    @staticmethod
    def getFileFormats():
        """Return the handled file formats; the base class handles none."""
        pass

    def close(self):
        """Close whichever of the input and output handles are open."""
        for handle in (self.inputHandle, self.outputHandle):
            if handle is not None:
                handle.close()

    def clean(self):
        """Run the subclass cleaning, closing the files even if it fails."""
        try:
            self._clean()
        finally:
            self.close()
class Clusterize(object):
    """Merge transcripts that overlap, or lie within a given distance.

    The input is sorted per chromosome (unless declared presorted), then
    read in order; each transcript is merged with the pending ones it
    overlaps, and pending transcripts that have been passed are written.
    """

    def __init__(self, verbosity):
        self.normalize = False
        self.presorted = False
        self.distance = 1
        self.colinear = False
        self.nbWritten = 0
        self.nbMerges = 0
        self.nbElements = 0
        self.verbosity = verbosity
        # chromosome -> name of the temporary sorted file
        self.splittedFileNames = {}

    def __del__(self):
        # Remove the temporary sorted files; some may already be gone.
        for fileName in self.splittedFileNames.values():
            if os.path.exists(fileName):
                os.remove(fileName)

    def setInputFile(self, fileName, format):
        """Choose the parser for the input and name the sorted temp file.

        The temporary file goes to $SMARTTMPPATH when that variable is set,
        otherwise next to the input file.
        """
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format)
        self.parser = parserChooser.getParser(fileName)
        self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
        if "SMARTTMPPATH" in os.environ:
            self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName))

    def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
        """Create the output writer for the given format and configure it."""
        writerChooser = WriterChooser()
        writerChooser.findFormat(format)
        self.writer = writerChooser.getWriter(fileName)
        self.writer.setTitle(title)
        self.writer.setFeature(feature)
        self.writer.setFeaturePart(featurePart)

    def setDistance(self, distance):
        self.distance = distance

    def setColinear(self, colinear):
        self.colinear = colinear

    def setNormalize(self, normalize):
        self.normalize = normalize

    def setPresorted(self, presorted):
        self.presorted = presorted

    def _sortFile(self):
        """Sort the input into one file per chromosome, unless presorted."""
        if self.presorted:
            return
        fs = FileSorter(self.parser, self.verbosity-4)
        fs.perChromosome(True)
        fs.setPresorted(self.presorted)
        fs.setOutputFileName(self.sortedFileName)
        fs.sort()
        self.splittedFileNames = fs.getOutputFileNames()
        self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
        self.nbElements = fs.getNbElements()

    def _iterate(self, chromosome):
        """Clusterize one sorted stream of transcripts.

        chromosome is a key of self.splittedFileNames, or None to read the
        (presorted) input parser directly. Each transcript read is added to
        self.nbElements; the counter is not reset here, so that run() can
        report the total over all chromosomes.
        """
        if chromosome is None:
            progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
            parser = self.parser
        else:
            progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
            parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
        transcripts = []
        for newTranscript in parser.getIterator():
            newTranscripts = []
            if newTranscript.__class__.__name__ == "Mapping":
                newTranscript = newTranscript.getTranscript()
            for oldTranscript in transcripts:
                if self._checkOverlap(newTranscript, oldTranscript):
                    # The old transcript is absorbed by the new one.
                    self._merge(newTranscript, oldTranscript)
                elif self._checkPassed(newTranscript, oldTranscript):
                    # Too far behind to be merged later: flush it.
                    self._write(oldTranscript)
                else:
                    newTranscripts.append(oldTranscript)
            newTranscripts.append(newTranscript)
            transcripts = newTranscripts
            self.nbElements += 1
            progress.inc()
        for transcript in transcripts:
            self._write(transcript)
        progress.done()

    def _merge(self, transcript1, transcript2):
        """Merge transcript2 into transcript1, after aligning its direction."""
        self.nbMerges += 1
        transcript2.setDirection(transcript1.getDirection())
        transcript1.merge(transcript2)

    def _write(self, transcript):
        self.nbWritten += 1
        self.writer.addTranscript(transcript)

    def _checkOverlap(self, transcript1, transcript2):
        """Tell whether the two transcripts should be merged."""
        if transcript1.getChromosome() != transcript2.getChromosome():
            return False
        if self.colinear and transcript1.getDirection() != transcript2.getDirection():
            return False
        if transcript1.getDistance(transcript2) > self.distance:
            return False
        return True

    def _checkPassed(self, transcript1, transcript2):
        """Tell whether transcript2 is on another chromosome or too far away."""
        return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance))

    def run(self):
        """Sort (if needed), clusterize, close the writer and print statistics."""
        self._sortFile()
        # Count the elements actually read, over all chromosomes.
        self.nbElements = 0
        if self.presorted:
            self._iterate(None)
        else:
            for chromosome in sorted(self.splittedFileNames.keys()):
                self._iterate(chromosome)
        self.writer.close()
        if self.verbosity > 0:
            print("# input: %d" % (self.nbElements))
            print("# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)))
            print("# merges: %d" % (self.nbMerges))


if __name__ == "__main__":
    description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
    parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]")
    parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]")
    parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    c = Clusterize(options.verbosity)
    c.setInputFile(options.inputFileName, options.format)
    c.setOutputFileName(options.outputFileName, options.outputFormat)
    c.setColinear(options.colinear)
    c.setDistance(options.distance)
    c.setNormalize(options.normalize)
    c.setPresorted(options.sorted)
    c.run()
class ClusterizeBySlidingWindows(object):
    """Count, or aggregate a tag over, the transcripts of sliding windows.

    Windows ("bins") have a fixed size and overlap; a transcript belongs to
    every window containing its start position. One region per window is
    written to the output, and the values can also be plotted or dumped as
    a comma-separated table.
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.strands = (0, )
        self.normalize = False
        self.plot = None
        self.excel = None
        self.outputFileName = ''
        self.defaultValue = None
        # Set by the setters below; defined here so that an omitted setter
        # gives a clear error in checkOptions, not an AttributeError.
        self.size = None
        self.overlap = None
        self.tag = None
        self.operation = None
        self.outputTagName = None

    def __del__(self):
        pass

    def setInputFile(self, fileName, format):
        self.parser = TranscriptContainer(fileName, format, self.verbosity)

    def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):
        """Create the output writer for the given format and configure it."""
        writerChooser = WriterChooser(self.verbosity)
        writerChooser.findFormat(format)
        self.writer = writerChooser.getWriter(fileName)
        self.writer.setTitle(title)
        self.writer.setFeature(feature)
        self.writer.setFeaturePart(featurePart)

    def setWindowSize(self, size):
        self.size = size

    def setWindowOverlap(self, overlap):
        self.overlap = overlap

    def setTag(self, tag):
        self.tag = tag

    def setOperation(self, operation):
        self.operation = operation

    def setBothStrands(self, bothStrands):
        if bothStrands:
            self.strands = (-1, 1)

    def setNormalize(self, normalize):
        self.normalize = normalize

    def setPlot(self, plot):
        self.plot = plot

    def setExcel(self, excel):
        self.excel = excel

    def setOutputTag(self, tag):
        self.outputTagName = tag

    def setDefaultValue(self, defaultValue):
        self.defaultValue = defaultValue

    def checkOptions(self):
        """Validate the window geometry and the requested operation.

        Raises:
            Exception: if size or overlap is unset, if the size is not
                greater than the overlap (windows would not advance), or
                if the operation is unknown.
        """
        if self.size is None or self.overlap is None:
            raise Exception("Window size and overlap must both be set! Aborting...")
        if self.size <= self.overlap:
            raise Exception("Window size (%s) must be greater than overlap (%s)! Aborting..." % (self.size, self.overlap))
        if self.operation is not None and self.operation not in ("sum", "avg", "med", "min", "max"):
            raise Exception("Do not understand operation '%s'! Aborting..." % (self.operation))

    def getChromosomeSizes(self):
        """Record, per chromosome, the greatest transcript start position."""
        self.sizes = {}
        progress = Progress(self.parser.getNbTranscripts(), "Getting sizes in genome", self.verbosity)
        for transcript in self.parser.getIterator():
            self.sizes[transcript.getChromosome()] = max(transcript.getStart(), self.sizes.get(transcript.getChromosome(), 0))
            progress.inc()
        progress.done()

    def getBinsFromPos(self, pos):
        """Return the tuple of bin indices whose window contains pos.

        A position lying in the overlap of two consecutive windows belongs
        to both; otherwise the tuple holds a single index.
        """
        step = self.size - self.overlap
        # Floor division: bin indices must be integers under any Python.
        bin = (pos - 1) // step
        if bin >= 1 and pos <= bin * step + self.overlap:
            return (bin - 1, bin)
        return (bin, )

    def getPosFromBin(self, bin):
        """Return the (start, end) positions, both inclusive, of a bin."""
        return (bin * (self.size - self.overlap) + 1, bin * (self.size - self.overlap) + self.size)

    def initializeBins(self):
        """Create empty per-strand, per-chromosome, per-bin accumulators."""
        self.binsPerStrand = {}
        self.sumsPerStrand = {}
        self.valuesPerStrand = {}
        self.toBePlottedPerStrand = {}
        for strand in self.strands:
            self.binsPerStrand[strand] = {}
            self.sumsPerStrand[strand] = {}
            self.valuesPerStrand[strand] = {}
            self.toBePlottedPerStrand[strand] = {}
            for chromosome in self.sizes:
                binRange = range(self.getBinsFromPos(self.sizes[chromosome])[-1] + 1)
                self.binsPerStrand[strand][chromosome] = dict([(i, 0) for i in binRange])
                self.sumsPerStrand[strand][chromosome] = dict([(i, 0.0) for i in binRange])
                self.valuesPerStrand[strand][chromosome] = dict([(i, []) for i in binRange])
                self.toBePlottedPerStrand[strand][chromosome] = dict([(i, 0) for i in binRange])

    def getNbElements(self, transcript):
        """Return the transcript's nbElements tag divided by its
        nbOccurrences tag, each defaulting to 1 when absent."""
        nbOccurrences = 1 if "nbOccurrences" not in transcript.getTagNames() else transcript.getTagValue("nbOccurrences")
        nbElements = 1 if "nbElements" not in transcript.getTagNames() else transcript.getTagValue("nbElements")
        return float(nbElements) / float(nbOccurrences)

    def setBins(self):
        """Add every transcript to the bins containing its start position.

        Raises:
            Exception: if a tag is requested, a transcript lacks it and no
                default value was given.
        """
        progress = Progress(self.parser.getNbTranscripts(), "Setting bins", self.verbosity)
        for transcript in self.parser.getIterator():
            nbElements = self.getNbElements(transcript)
            strand = transcript.getDirection() if len(self.strands) == 2 else 0
            chromosome = transcript.getChromosome()
            for bin in self.getBinsFromPos(transcript.getStart()):
                self.binsPerStrand[strand][chromosome][bin] += nbElements
                if self.tag is not None:
                    if self.tag not in transcript.getTagNames():
                        if self.defaultValue is None:
                            raise Exception("Tag %s undefined in transcript %s" % (self.tag, transcript))
                        value = self.defaultValue
                    else:
                        value = float(transcript.getTagValue(self.tag))
                    self.sumsPerStrand[strand][chromosome][bin] += value
                    self.valuesPerStrand[strand][chromosome][bin].append(value)
            progress.inc()
        progress.done()

    def aggregateData(self):
        """Fill toBePlottedPerStrand according to the chosen operation;
        without a known operation the element counts are used."""
        computers = {
            "sum": self.computeSumData,
            "avg": self.computeAvgData,
            "med": self.computeMedData,
            "min": self.computeMinData,
            "max": self.computeMaxData,
            # NOTE(review): checkOptions rejects "GCpercent", and the method
            # expects sequence objects where setBins stores floats -- this
            # entry looks unreachable through run(); confirm before use.
            "GCpercent": self.computeGCPercent,
        }
        compute = computers.get(self.operation)
        if compute is None:
            self.toBePlottedPerStrand = self.binsPerStrand
        else:
            compute()

    def _aggregateValues(self, function):
        """Store function(values) for every bin with at least one value."""
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    values = self.valuesPerStrand[strand][chromosome][bin]
                    if values:
                        self.toBePlottedPerStrand[strand][chromosome][bin] = function(values)

    @staticmethod
    def _median(values):
        """Return the median of a non-empty list of numbers."""
        ordered = sorted(values)
        size = len(ordered)
        if size % 2 == 1:
            return ordered[(size - 1) // 2]
        return (ordered[size // 2 - 1] + ordered[size // 2]) / 2.0

    def computeSumData(self):
        self.toBePlottedPerStrand = self.sumsPerStrand

    def computeAvgData(self):
        """Divide each bin's tag sum by its (weighted) element count."""
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.binsPerStrand[strand][chromosome][bin] != 0:
                        self.toBePlottedPerStrand[strand][chromosome][bin] = float(self.sumsPerStrand[strand][chromosome][bin]) / self.binsPerStrand[strand][chromosome][bin]

    def computeMedData(self):
        self._aggregateValues(self._median)

    def computeMinData(self):
        self._aggregateValues(min)

    def computeMaxData(self):
        self._aggregateValues(max)

    def computeGCPercent(self):
        """Store a GC percentage per bin, or "NA" when at least half of the
        sequence is N. Calls countNt, getSize and
        getGCpercentageInSequenceWithoutCountNInLength on the bin value."""
        for strand in self.strands:
            for chromosome in self.binsPerStrand[strand]:
                for bin in self.binsPerStrand[strand][chromosome]:
                    if self.valuesPerStrand[strand][chromosome][bin]:
                        subSequence = self.valuesPerStrand[strand][chromosome][bin]
                        NPercent = 100 * (subSequence.countNt("N") / float(subSequence.getSize()))
                        if NPercent >= 50:
                            currentGCpercent = "NA"
                        else:
                            currentGCpercent = subSequence.getGCpercentageInSequenceWithoutCountNInLength()
                        self.toBePlottedPerStrand[strand][chromosome][bin] = currentGCpercent

    def plotData(self):
        """Plot one line per strand and chromosome having bins."""
        if self.plot is None:
            return
        for strand in self.strands:
            for chromosome in self.toBePlottedPerStrand[strand]:
                if len(self.toBePlottedPerStrand[strand][chromosome]) > 0:
                    # NOTE(review): every plotter is given the same file
                    # name; presumably later plots replace earlier ones.
                    plotter = RPlotter(self.plot, self.verbosity)
                    plotter.setFill(0)
                    plotter.addLine(self.toBePlottedPerStrand[strand][chromosome], chromosome)
                    plotter.plot()

    def writeExcel(self):
        """Write, per strand, a comma-separated table: a header of window
        ranges, then one row per chromosome with its values in bin order."""
        if self.excel is None:
            return
        excelFile = open(self.excel, "w")
        try:
            for strand in self.strands:
                chromosomes = self.toBePlottedPerStrand[strand]
                lastBins = [max(bins) for bins in chromosomes.values() if bins]
                if not lastBins:
                    continue
                for bin in range(0, max(lastBins) + 1):
                    excelFile.write(",%d-%d" % self.getPosFromBin(bin))
                excelFile.write("\n")
                for chromosome in chromosomes:
                    excelFile.write("%s" % (chromosome))
                    # Ascending bins, so values line up with the header.
                    for bin in sorted(chromosomes[chromosome]):
                        excelFile.write(",%f" % (chromosomes[chromosome][bin]))
                    excelFile.write("\n")
        finally:
            excelFile.close()

    def printRegions(self):
        """Write one region per strand, chromosome and bin, tagged with the
        element count and the aggregated value, then close the writer."""
        cpt = 1
        tagOp = "nb"
        tagName = "Elements"
        outputTagName = "nbElements"
        if self.operation is not None:
            tagOp = self.operation.lower()
        if self.tag is not None:
            tagName = self.tag.title()
        if self.outputTagName is not None:
            outputTagName = self.outputTagName

        for strand in self.strands:
            for chromosome in self.toBePlottedPerStrand[strand]:
                for bin in self.toBePlottedPerStrand[strand][chromosome]:
                    start, end = self.getPosFromBin(bin)
                    transcript = Transcript()
                    transcript.setName("region%d" % cpt)
                    transcript.setChromosome(chromosome)
                    transcript.setStart(start)
                    transcript.setEnd(end)
                    transcript.setDirection(1 if strand == 0 else strand)
                    transcript.setTagValue(outputTagName, self.binsPerStrand[strand][chromosome][bin])
                    transcript.setTagValue("%s%s" % (tagOp, tagName), str(self.toBePlottedPerStrand[strand][chromosome][bin]))
                    self.writer.addTranscript(transcript)
                    cpt += 1
        self.writer.close()

    def run(self):
        self.checkOptions()
        self.getChromosomeSizes()
        self.initializeBins()
        self.setBins()
        self.aggregateData()
        if self.excel:
            self.writeExcel()
        if self.plot:
            self.plotData()
        self.printRegions()


if __name__ == "__main__":

    # parse command line
    description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
    parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="format of the output file [format: transcript file format]")
    parser.add_option("-s", "--size", dest="size", action="store", type="int", help="size of the regions [compulsory] [format: int]")
    parser.add_option("-e", "--overlap", dest="overlap", action="store", type="int", help="overlap between two consecutive regions [compulsory] [format: int]")
    parser.add_option("-m", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="use a given tag as input (instead of summing number of features) [format: string]")
    parser.add_option("-r", "--operation", dest="operation", action="store", default=None, type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")
    parser.add_option("-d", "--defaultValue",dest="defaultValue", action="store", type="float", help="default value for input tag [format: float]")
    parser.add_option("-w", "--write", dest="writeTag", action="store", default=None, type="string", help="print the result in the given tag (default usually is 'nbElements') [format: string]")
    parser.add_option("-2", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]")
    parser.add_option("-p", "--plot", dest="plot", action="store", default=None, type="string", help="plot regions to the given file [format: output file in PNG format]")
    parser.add_option("-x", "--excel", dest="excel", action="store", default=None, type="string", help="write an Excel file to the given file [format: output file in Excel format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    cbsw = ClusterizeBySlidingWindows(options.verbosity)
    cbsw.setInputFile(options.inputFileName, options.inputFormat)
    cbsw.setOutputFileName(options.outputFileName, options.outputFormat)
    cbsw.setWindowSize(options.size)
    cbsw.setWindowOverlap(options.overlap)
    cbsw.setTag(options.tag)
    cbsw.setDefaultValue(options.defaultValue)
    cbsw.setOperation(options.operation)
    cbsw.setOutputTag(options.writeTag)
    cbsw.setBothStrands(options.strands)
    cbsw.setNormalize(options.normalize)
    cbsw.setPlot(options.plot)
    cbsw.setExcel(options.excel)
    cbsw.run()
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class CompareOverlapping(object):
    """Command-line front end comparing two transcript lists for overlaps.

    Options are read from the command line by setAttributesFromCmdLine();
    run() configures a TranscriptListsComparator with them and writes the
    result in GFF3 format.
    """

    def __init__(self):
        self._options = None

    def setAttributesFromCmdLine(self):
        """Parse sys.argv and store the resulting options in self._options."""
        description = "Compare Overlapping v1.0.3: Get the data which overlap with a reference set. [Category: Data Comparison]"

        parser = OptionParser(description = description)
        parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
        parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
        parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
        parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
        parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
        parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
        parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
        parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
        parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
        parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]")
        parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]")
        parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]")
        parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]")
        parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]")
        parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")
        parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")
        parser.add_option("-d", "--distance", dest="distance", action="store", default=None, type="int", help="accept some distance between query and reference [format: int]")
        parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
        parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
        parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=1, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
        parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
        parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
        parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
        parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
        parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
        (self._options, args) = parser.parse_args()

    def run(self):
        """Compare the two inputs as requested by the stored options.

        File 1 is the query and file 2 the reference. Unless the match is
        inverted (-x), a min/avg/med/max summary of the comparator's odds
        is printed when verbosity is positive.
        """
        options = self._options
        logHandle = None
        if options.log:
            # NOTE(review): the log is opened on the same path that is given
            # to the output writer below -- confirm that the writer derives
            # a different file name, otherwise the two would collide.
            logHandle = open(options.output, "w")

        try:
            transcriptContainer1 = TranscriptContainer(options.inputFileName1, options.format1, options.verbosity)
            transcriptContainer2 = TranscriptContainer(options.inputFileName2, options.format2, options.verbosity)
            writer = TranscriptWriter(options.output, "gff3", options.verbosity)

            transcriptListComparator = TranscriptListsComparator(logHandle, options.verbosity)
            query = transcriptListComparator.QUERY
            reference = transcriptListComparator.REFERENCE
            transcriptListComparator.restrictToStart(query, options.start1)
            transcriptListComparator.restrictToStart(reference, options.start2)
            transcriptListComparator.restrictToEnd(query, options.end1)
            transcriptListComparator.restrictToEnd(reference, options.end2)
            transcriptListComparator.extendFivePrime(query, options.fivePrime1)
            transcriptListComparator.extendFivePrime(reference, options.fivePrime2)
            transcriptListComparator.extendThreePrime(query, options.threePrime1)
            transcriptListComparator.extendThreePrime(reference, options.threePrime2)
            transcriptListComparator.acceptIntrons(query, options.introns)
            transcriptListComparator.acceptIntrons(reference, options.introns)
            transcriptListComparator.getAntisenseOnly(options.antisense)
            transcriptListComparator.getColinearOnly(options.colinear)
            transcriptListComparator.getInvert(options.exclude)
            transcriptListComparator.setMaxDistance(options.distance)
            transcriptListComparator.setMinOverlap(options.minOverlap)
            transcriptListComparator.setPcOverlap(options.pcOverlap)
            transcriptListComparator.setIncludedOnly(options.included)
            transcriptListComparator.setIncludingOnly(options.including)
            transcriptListComparator.includeNotOverlapping(options.notOverlapping)
            transcriptListComparator.computeOdds(True)
            transcriptListComparator.setInputTranscriptContainer(query, transcriptContainer1)
            transcriptListComparator.setInputTranscriptContainer(reference, transcriptContainer2)
            transcriptListComparator.setOutputWriter(writer)
            transcriptListComparator.compareTranscriptList()
        finally:
            # Close the log even when the comparison fails.
            if logHandle is not None:
                logHandle.close()

        if not options.exclude:
            odds = transcriptListComparator.getOdds()
            if options.verbosity > 0 and odds:
                print("min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(odds))
icompareOverlapping = CompareOverlapping() + icompareOverlapping.setAttributesFromCmdLine() + icompareOverlapping.run() diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/convertTranscriptFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/convertTranscriptFile.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,115 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class ConvertTranscriptFile(object):
    """Read a transcript file and write it back in another format.

    The object can be configured either through the constructor arguments or
    from the command line with setAttributesFromCmdLine().
    """

    def __init__(self,inputFileName="", inputFormat ="", outputFileName="", outputFormat="", name="", sequenceFileName=None, strands=False, galaxy=False, feature=None, featurePart=None, verbosity=1):
        self.inputFileName = inputFileName
        self.inputFormat = inputFormat
        self.outputFileName = outputFileName
        self.outputFormat = outputFormat
        self.name = name
        self.sequenceFileName = sequenceFileName
        self.strands = strands
        self.galaxy = galaxy

        # feature / featurePart can only be set here; no command-line option exists for them
        self.feature = feature
        self.featurePart = featurePart

        self.verbosity = verbosity

    def setAttributesFromCmdLine(self):
        """Parse sys.argv and copy the option values onto this object."""
        description = "Convert Transcript File v1.0.3: Convert a file from a format to another. [Category: Conversion]"
        parser = OptionParser(description = description)
        parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in format given by -f]")
        parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript or mapping file format]")
        parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in format given by -g]")
        parser.add_option("-g", "--outputFormat", dest="outputFormat", action="store", type="string", help="format of the output file [compulsory] [format: transcript file format]")
        parser.add_option("-n", "--name", dest="name", action="store", default="SMART", type="string", help="name for the transcripts [format: string] [default: SMART]")
        parser.add_option("-s", "--sequences", dest="sequenceFileName", action="store", default=None, type="string", help="give the corresponding Multi-Fasta file (useful for EMBL format) [format: string]")
        parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="consider the 2 strands as different (only useful for writing WIG files) [format: bool] [default: False]")
        parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
        parser.add_option("-G", "--galaxy", dest="galaxy", action="store_true", default=False, help="used for galaxy [format: bool] [default: False]")
        options, _ = parser.parse_args()
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy the parsed option values onto the matching attributes."""
        self.inputFileName = options.inputFileName
        self.inputFormat = options.inputFormat
        self.outputFileName = options.outputFileName
        self.outputFormat = options.outputFormat
        self.name = options.name
        self.sequenceFileName = options.sequenceFileName
        self.strands = options.strands
        self.galaxy = options.galaxy
        self.verbosity = options.verbosity

    def run(self):
        """Connect a TranscriptContainer to a TranscriptWriter and convert the file."""
        # create parser
        parser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity)
        # create writer
        writer = TranscriptWriter(self.outputFileName, self.outputFormat, self.verbosity)
        try:
            # connect parser and writer
            writer.setContainer(parser)

            if self.name is not None:
                writer.setTitle(self.name)
            if self.feature is not None:
                writer.setFeature(self.feature)
            if self.featurePart is not None:
                writer.setFeaturePart(self.featurePart)
            if self.sequenceFileName is not None:
                writer.addSequenceFile(self.sequenceFileName)

            if self.verbosity > 0:
                print("%i items found" % (parser.getNbItems()))

            if self.strands:
                writer.setStrands(True)
            # convert
            writer.write()
        finally:
            # close the writer even when the conversion fails half-way
            writer.close()

if __name__ == "__main__":
    iConvertTranscriptFile = ConvertTranscriptFile()
    iConvertTranscriptFile.setAttributesFromCmdLine()
    iConvertTranscriptFile.run()
if __name__ == "__main__":

    # command-line interface
    description = "Coordinates to Sequences v1.0.2: Extract the sequences from a list of coordinates. [Category: Conversion]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    optionParser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    optionParser.add_option("-s", "--sequences", dest="sequences", action="store", type="string", help="file that contains the sequences [compulsory] [format: file in FASTA format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file (FASTA format) [format: output file in FASTA format]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # input coordinates, source sequences and FASTA output
    transcriptContainer = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    sequenceParser = FastaParser(options.sequences, options.verbosity)
    writer = FastaWriter(options.outputFileName, options.verbosity)

    # extract one sequence per transcript and hand it to the writer
    progress = Progress(transcriptContainer.getNbTranscripts(), "Reading %s" % (options.inputFileName), options.verbosity)
    for transcript in transcriptContainer.getIterator():
        writer.addSequence(transcript.extractSequence(sequenceParser))
        progress.inc()
    progress.done()
def writeOddsCsv(oddsPerTranscript, fileName):
    """Write the transcript names grouped by their count to a CSV file.

    The file starts with the header "Number,Transcript"; then, for every
    distinct count in increasing order, one line holds the count, a comma and
    the names having that count, each followed by a space.

    oddsPerTranscript -- dictionary mapping a transcript name to a number
    fileName          -- path of the file to create
    """
    # group the names once instead of rescanning every name for every count
    namesPerNumber = {}
    for name in oddsPerTranscript:
        namesPerNumber.setdefault(oddsPerTranscript[name], []).append(name)

    # 'with' guarantees the handle is closed even if a write fails
    with open(fileName, "w") as csvFile:
        csvFile.write("Number,Transcript\n")
        for number in sorted(namesPerNumber):
            csvFile.write("%d," % (number))
            csvFile.write("".join(["%s " % (name) for name in namesPerNumber[number]]))
            csvFile.write("\n")


if __name__ == "__main__":

    # parse command line
    description = "Find TSS v1.0.1: Find the transcription start site of a list of transcripts. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=10, type="int", help="distance between two reads to mark the same TSS [format: int] [default: 10]")
    parser.add_option("-e", "--colinear", dest="colinear", action="store_true", default=False, help="group by strand [format: bool] [default: false]")
    parser.add_option("-c", "--csv", dest="csv", action="store", default=None, type="string", help="output a CSV file in the given path [format: output file in Excel format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    transcriptContainer = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    transcriptListComparator = TranscriptListsComparator(None, options.verbosity)
    # only the first nucleotide of each read is considered
    transcriptListComparator.restrictToStart(transcriptListComparator.QUERY, 1)
    transcriptListComparator.setMaxDistance(options.distance)
    transcriptListComparator.aggregate(True)
    transcriptListComparator.computeOdds(True)
    transcriptListComparator.getColinearOnly(options.colinear)
    transcriptListComparator.setNormalization(options.normalize)
    transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, transcriptContainer)
    transcriptListComparator.setOutputWriter(Gff3Writer(options.output, options.verbosity))
    transcriptListComparator.compareTranscriptListSelfMerge()

    if options.csv is not None:
        writeOddsCsv(transcriptListComparator.getOddsPerTranscript(), options.csv)
class Fold(object):
    """
    Fold a series of transcripts

    Thin wrapper that feeds a transcript file to an RnaFoldLauncher and hands
    the launcher's results to a Gff3Writer.
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.rnaFoldLauncher = RnaFoldLauncher(verbosity)
        # created by setOutputFileName(); start() requires it to be set
        self.gff3Writer = None


    def setInputFileName(self, fileName, format):
        """Read the transcripts of fileName (in the given format) and give them to the launcher."""
        # use the instance verbosity: the former code read the module-level
        # 'options' object, which only exists when this file runs as a script
        transcriptContainer = TranscriptContainer(fileName, format, self.verbosity)
        self.rnaFoldLauncher.setTranscriptList(transcriptContainer)


    def setOutputFileName(self, fileName):
        """Create the GFF3 writer; ".gff3" is appended to fileName."""
        self.gff3Writer = Gff3Writer("%s.gff3" % (fileName), self.verbosity)


    def setGenomeFileName(self, fileName):
        """Forward the genome file name to the launcher."""
        self.rnaFoldLauncher.setGenomeFile(fileName)


    def setExtensions(self, fivePrime, threePrime):
        """Forward the 5' and 3' extension sizes to the launcher."""
        self.rnaFoldLauncher.setExtensions(fivePrime, threePrime)


    def start(self):
        """Fetch the launcher's results and add them to the GFF3 writer."""
        # NOTE(review): no explicit write()/close() is issued on the writer --
        # confirm that Gff3Writer flushes by itself.
        self.gff3Writer.addTranscriptList(self.rnaFoldLauncher.getResults())



if __name__ == "__main__":

    # parse command line
    description = "Fold v1.0.1: Fold a list of transcript and give the energy. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-g", "--genome", dest="genomeFileName", action="store", type="string", help="genome file name [format: file in FASTA format]")
    parser.add_option("-5", "--fivePrime", dest="fivePrime", action="store", type="int", help="extend towards the 5' end [format: int]")
    parser.add_option("-3", "--threePrime", dest="threePrime", action="store", type="int", help="extend towards the 3' end [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    folder = Fold(options.verbosity)
    folder.setInputFileName(options.inputFileName, options.format)
    folder.setOutputFileName(options.outputFileName)
    folder.setExtensions(options.fivePrime, options.threePrime)
    folder.setGenomeFileName(options.genomeFileName)
    folder.start()
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class DifferenceGetter(object):
    """Compute what remains of a genome, or of an annotation, once another annotation is removed.

    When a reference file is set, the comparison is delegated to
    TranscriptListsComparator; otherwise every region of the sequence file
    that is not covered by the annotation is written out.
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.annotationParser = None
        self.referenceParser = None
        self.sequenceParser = None
        # set by setOutputFile()
        self.writer = None
        # chromosome -> list of [start, end] regions still uncovered; filled by initialize()
        self.presence = {}
        # counter used to name the output regions
        self.transcriptCount = 1
        self.split = False

    def createTranscript(self, chromosome, start, end):
        """Build a '+' strand Transcript named region_<n> for the given coordinates."""
        transcript = Transcript()
        transcript.setChromosome(chromosome)
        transcript.setDirection("+")
        transcript.setStart(start)
        transcript.setEnd(end)
        transcript.setName("region_%d" % self.transcriptCount)
        transcript.setTagValue("ID", "region_%d" % self.transcriptCount)
        self.transcriptCount += 1
        return transcript

    def setSplit(self, split):
        self.split = split

    def setAnnotationFile(self, fileName, format):
        if fileName is not None:
            self.annotationParser = TranscriptContainer(fileName, format, self.verbosity)

    def setReferenceFile(self, fileName, format):
        if fileName is not None:
            self.referenceParser = TranscriptContainer(fileName, format, self.verbosity)

    def setSequenceFile(self, fileName):
        if fileName is not None:
            self.sequenceParser = FastaParser(fileName, self.verbosity)

    def setOutputFile(self, fileName):
        self.writer = Gff3Writer(fileName, self.verbosity)

    def initialize(self):
        """Start with one region per chromosome, spanning position 1 to the chromosome size."""
        self.presence = {}
        for chromosome in self.sequenceParser.getRegions():
            self.presence[chromosome] = [[1, self.sequenceParser.getSizeOfRegion(chromosome)]]

    def _removeInterval(self, chromosome, start, end):
        """Subtract [start, end] from the uncovered regions of a chromosome.

        Regions overlapping the interval are dropped and the parts sticking
        out on either side are appended to the end of the list. A chromosome
        that is not in self.presence is ignored: there is nothing to subtract.
        """
        regions = self.presence.get(chromosome)
        if regions is None:
            return
        untouched = []
        leftovers = []
        for region in regions:
            regionStart, regionEnd = region
            if regionStart <= end and start <= regionEnd:
                if regionStart < start:
                    leftovers.append([regionStart, start - 1])
                if regionEnd > end:
                    leftovers.append([end + 1, regionEnd])
            else:
                untouched.append(region)
        # update in place so that the list object stored in self.presence is kept
        regions[:] = untouched + leftovers

    def readTranscripts(self):
        """Remove every annotated transcript from the uncovered regions."""
        nbTranscripts = self.annotationParser.getNbTranscripts()
        progress = Progress(nbTranscripts, "Parsing annotation file" , self.verbosity)
        for transcript in self.annotationParser.getIterator():
            self._removeInterval(transcript.getChromosome(), transcript.getStart(), transcript.getEnd())
            progress.inc()
        progress.done()

    def writeOutput(self):
        """Write every remaining region as a transcript."""
        for chromosome in self.presence:
            for element in self.presence[chromosome]:
                start, end = element
                self.writer.addTranscript(self.createTranscript(chromosome, start, end))
        self.writer.write()

    def compareToSequence(self):
        self.initialize()
        self.readTranscripts()
        self.writeOutput()

    def compareToAnnotation(self):
        """Delegate the difference between annotation and reference to TranscriptListsComparator."""
        transcriptListComparator = TranscriptListsComparator(None, self.verbosity)
        transcriptListComparator.setSplitDifference(self.split)
        transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, self.annotationParser)
        transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.REFERENCE, self.referenceParser)
        transcriptListComparator.setOutputWriter(self.writer)
        transcriptListComparator.getDifferenceTranscriptList()

    def run(self):
        """Compare to the reference annotation when one was given, to the sequence file otherwise."""
        if self.referenceParser is not None:
            self.compareToAnnotation()
        else:
            self.compareToSequence()


if __name__ == "__main__":

    # parse command line
    description = "Get Difference v1.0.1: Get all the regions of the genome, except the one given or get all the elements from the first set which does not ovelap with the second set (at the nucleotide level). [Category: Data Comparison]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", default=None, type="string", help="reference file [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", default=None, type="string", help="format of the reference file [format: transcript file format]")
    parser.add_option("-s", "--sequence", dest="sequenceFileName", action="store", default=None, type="string", help="sequence file [format: file in FASTA format]")
    # the closing bracket of the [default: ...] tag was missing
    parser.add_option("-p", "--split", dest="split", action="store_true", default=False, help="when comparing to a set of genomic coordinates, do not join [format: boolean] [default: False]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    getter = DifferenceGetter(options.verbosity)
    getter.setSplit(options.split)
    getter.setAnnotationFile(options.inputFileName1, options.format1)
    getter.setSequenceFile(options.sequenceFileName)
    getter.setReferenceFile(options.inputFileName2, options.format2)
    getter.setOutputFile(options.outputFileName)
    getter.run()
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class GetDistance(object):
    """Compute, and optionally plot, the distances between a query and a reference transcript list.

    The distances come from TranscriptListsComparator.compareTranscriptListDistance();
    the plot is drawn through RPlotter when an output file name is set.
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.writer = None
        self.spearman = False
        self.tlc = TranscriptListsComparator(None, self.verbosity)
        # (0, ) when strands are merged, (-1, 1) after setStrands(True)
        self.strands = (0, )
        self.buckets = None
        self.title = ""
        self.xMin = None
        self.xMax = None
        self.proportion = False
        self.outputFileName = None
        self.keep = False
        # filled in later by the setters, compare() and plot()
        self.transcriptContainer1 = None
        self.transcriptContainer2 = None
        self.distances = None
        self.nbElements = None
        self.plotter = None

    def __del__(self):
        pass

    def setQueryFile(self, fileName, format):
        self.transcriptContainer1 = TranscriptContainer(fileName, format, self.verbosity)

    def setReferenceFile(self, fileName, format):
        self.transcriptContainer2 = TranscriptContainer(fileName, format, self.verbosity)

    def setOutputFile(self, fileName):
        self.outputFileName = fileName

    def setOutputTranscriptFile(self, fileName):
        if fileName is not None:
            self.writer = Gff3Writer(fileName, self.verbosity)

    def restrictQueryToStart(self, number):
        self.tlc.restrictToStart(self.tlc.QUERY, number)

    def restrictReferenceToStart(self, number):
        self.tlc.restrictToStart(self.tlc.REFERENCE, number)

    def restrictQueryToEnd(self, number):
        self.tlc.restrictToEnd(self.tlc.QUERY, number)

    def restrictReferenceToEnd(self, number):
        self.tlc.restrictToEnd(self.tlc.REFERENCE, number)

    def setAbsolute(self, boolean):
        self.tlc.setAbsolute(boolean)

    def setProportion(self, boolean):
        self.proportion = boolean

    def setColinear(self, boolean):
        self.tlc.getColinearOnly(boolean)

    def setAntisense(self, boolean):
        self.tlc.getAntisenseOnly(boolean)

    def setDistances(self, minDistance, maxDistance):
        self.tlc.setMinDistance(minDistance)
        self.tlc.setMaxDistance(maxDistance)

    def setStrands(self, boolean):
        self.tlc.setStrandedDistance(boolean)
        if boolean:
            self.strands = (-1, 1)

    def setUpstream(self, number):
        self.tlc.setUpstream(self.tlc.REFERENCE, number)

    def setDownstream(self, number):
        self.tlc.setDownstream(self.tlc.REFERENCE, number)

    def setBuckets(self, number):
        self.buckets = number

    def setTitle(self, title):
        self.title = title

    def setXValues(self, xMin, xMax):
        self.xMin, self.xMax = xMin, xMax

    def keepTmpValues(self, boolean):
        self.keep = boolean

    def getSpearman(self, boolean):
        """Enable or disable the output of Spearman's rho."""
        # the argument used to be ignored and the flag was always switched on
        self.spearman = boolean

    def compare(self):
        """Run the distance comparison and store its result in self.distances."""
        self.tlc.setInputTranscriptContainer(self.tlc.QUERY, self.transcriptContainer1)
        self.tlc.setInputTranscriptContainer(self.tlc.REFERENCE, self.transcriptContainer2)
        self.tlc.setOutputWriter(self.writer)
        self.distances = self.tlc.compareTranscriptListDistance()

    def checkEmptyDistances(self):
        """Return True when no strand holds any distance."""
        return sum([len(self.distances[strand]) for strand in self.strands]) == 0

    def setPlotterMinusStrand(self):
        """Negate the counts of strand -1, when that strand is handled."""
        if -1 in self.strands:
            # items() instead of iteritems(): values are reassigned while looping
            for x, y in list(self.distances[-1].items()):
                self.distances[-1][x] = -y

    def setPlotterProportion(self):
        """Turn the counts into percentages of the total, when proportions are requested."""
        if not self.proportion:
            return
        self.nbElements = sum([abs(sum(self.distances[strand].values())) for strand in self.strands])
        for strand in self.strands:
            self.distances[strand] = dict([(distance, float(nb) / self.nbElements * 100) for distance, nb in self.distances[strand].items()])

    def setPlotter(self):
        """Create and configure the RPlotter from the current settings."""
        self.plotter = RPlotter(self.outputFileName, self.verbosity, self.keep)
        if self.buckets is not None:
            self.plotter.setBarplot(True)
        self.plotter.setFill(0)
        self.plotter.setXLabel("distance")
        self.plotter.setYLabel("# elements")
        if self.proportion:
            self.plotter.setYLabel("%% elements (%d in toto)" % (self.nbElements))
        self.plotter.setBuckets(self.buckets)
        self.plotter.setMinimumX(self.xMin)
        self.plotter.setMaximumX(self.xMax)
        self.plotter.setTitle(self.title)

    def plot(self):
        """Post-process the distances and plot them when an output file is set.

        Prints "No output." and exits the interpreter when there is no distance.
        """
        if len(self.strands) == 1:
            # unstranded results are wrapped so that both cases share one layout
            self.distances = {0: self.distances}
        if self.checkEmptyDistances():
            print("No output.")
            sys.exit()
        self.setPlotterMinusStrand()
        self.setPlotterProportion()
        if self.outputFileName is None:
            return
        self.setPlotter()
        for strand in self.strands:
            self.plotter.addLine(self.distances[strand])
        self.plotter.plot()

    def printSpearman(self):
        """Print Spearman's rho when requested and a plot was produced."""
        # rho is provided by the plotter, which only exists when an output
        # file was given; without it there is nothing to report
        if self.spearman and self.plotter is not None:
            print("Spearman's rho: %.5f" % (self.plotter.getSpearmanRho()))

    def run(self):
        self.compare()
        self.plot()
        self.printSpearman()
dest="colinear", action="store_true", default=False, help="only consider features on the same strand [format: bool] [default: false]") + parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="only consider features on the opposite strand [format: bool] [default: false]") + parser.add_option("-b", "--absolute", dest="absolute", action="store_true", default=False, help="give the absolute value of the distance [format: bool] [default: false]") + parser.add_option("-p", "--proportion", dest="proportion", action="store_true", default=False, help="give the proportion on the y-axis instead of the number of distances [format: bool] [default: false]") + parser.add_option("-s", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first 5' nucleotides for list 1 [format: int]") + parser.add_option("-S", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first 5' nucleotides for list 2 [format: int]") + parser.add_option("-e", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last 3' nucleotides for list 1 [format: int]") + parser.add_option("-E", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last 3' nucleotides for list 2 [format: int]") + parser.add_option("-m", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance considered between two transcripts [format: int] [default: None]") + parser.add_option("-M", "--maxDistance", dest="maxDistance", action="store", default=1000, type="int", help="maximum distance considered between two transcripts [format: int] [default: 1000]") + parser.add_option("-5", "--fivePrime", dest="fivePrime", action="store_true", default=False, help="consider the elements from list 1 which are upstream of elements of list 2 [format: bool] [default: False]") + parser.add_option("-3", 
"--threePrime", dest="threePrime", action="store_true", default=False, help="consider the elements from list 1 which are downstream of elements of list 2 [format: bool] [default: False]") + parser.add_option("-u", "--buckets", dest="buckets", action="store", default=None, type="int", help="plot histogram instead of line plot with given interval size [format: int] [default: None]") + parser.add_option("-2", "--2strands", dest="twoStrands", action="store_true", default=False, help="plot the distributions of each strand separately [format: bool] [default: False]") + parser.add_option("-r", "--spearman", dest="spearman", action="store_true", default=False, help="compute Spearman rho [format: bool] [default: False]") + parser.add_option("-x", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int] [default: None]") + parser.add_option("-X", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int] [default: None]") + parser.add_option("-t", "--title", dest="title", action="store", default=None, type="string", help="title for the graph [format: int] [default: None]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-k", "--keep", dest="keep", action="store_true", default=False, help="keep temporary files [format: bool]") + (options, args) = parser.parse_args() + + gd = GetDistance(options.verbosity) + gd.setQueryFile(options.inputFileName1, options.format1) + gd.setReferenceFile(options.inputFileName2, options.format2) + gd.setOutputFile(options.outputFileName) + gd.setOutputTranscriptFile(options.outputDistances) + gd.setColinear(options.colinear) + gd.setAntisense(options.antisense) + gd.setAbsolute(options.absolute) + gd.setProportion(options.proportion) + gd.restrictQueryToStart(options.start1) + gd.restrictReferenceToStart(options.start2) 
+ gd.restrictQueryToEnd(options.end1) + gd.restrictReferenceToEnd(options.end2) + gd.setDistances(options.minDistance, options.maxDistance) + gd.setUpstream(options.fivePrime) + gd.setDownstream(options.threePrime) + gd.setStrands(options.twoStrands) + gd.setBuckets(options.buckets) + gd.setTitle(options.title) + gd.setXValues(options.xMin, options.xMax) + gd.keepTmpValues(options.keep) + gd.run() diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getDistribution.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,291 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
"""Get the distribution of some elements along the chromosomes"""


def divideKeyDict(dictionary, ratio):
    """Return a copy of *dictionary* whose keys are divided by *ratio*."""
    return dict((key / ratio, value) for key, value in dictionary.items())


def setTranscript(chromosome, direction, start, end, name, value):
    """Build a Transcript for one bin, storing *value* in its "nbElements" tag."""
    transcript = Transcript()
    transcript.setChromosome(chromosome)
    transcript.setDirection(direction)
    transcript.setStart(start)
    transcript.setEnd(end)
    transcript.setName(name)
    transcript.setTagValue("nbElements", value)
    return transcript


def readChromosomeSizes(fileName, verbosity = 0):
    """Return {sequence name: sequence length} read from a FASTA file.

    The file is closed on exit, and nothing is recorded for a file that holds
    no header (a ``None`` key used to be inserted in that case).
    """
    sizes = {}
    name  = None
    size  = 0
    with open(fileName) as referenceHandle:
        for line in referenceHandle:
            line = line.strip()
            if line == "":
                continue
            if line[0] == ">":
                if name is not None:
                    if verbosity > 10:
                        print(name)
                    sizes[name] = size
                size = 0
                name = line[1:]
            else:
                size += len(line)
    if name is not None:
        sizes[name] = size
    return sizes


def computeSliceSize(maxSize, nbBins):
    """Return the bin width: maxSize / nbBins truncated to two significant digits.

    The result is at least 1, so that later divisions by the slice size cannot
    fail when the region is shorter than the number of bins.
    """
    rawSize = int(maxSize / float(nbBins))
    # never use a negative exponent: it would bring floats into the arithmetic
    magnitude = 10 ** max(len("%d" % (rawSize)) - 2, 0)
    return max(1, (rawSize // magnitude) * magnitude)


def computeDensities(counts, sizes, start, sliceSize, magnifyingFactor):
    """Convert per-bin counts to densities (count per nt times magnifyingFactor).

    The first and last bins of each chromosome may be shorter than sliceSize;
    their density is computed on their actual length.
    """
    densities = {}
    firstBin  = (start // sliceSize) * sliceSize + 1
    for chromosome in counts:
        chromosomeCounts    = counts[chromosome]
        chromosomeDensities = dict((binStart, float(nb) / sliceSize * magnifyingFactor) for binStart, nb in chromosomeCounts.items())
        if start % sliceSize != 0:
            chromosomeDensities[firstBin] = float(chromosomeCounts[firstBin]) / (sliceSize - (start % sliceSize)) * magnifyingFactor
        if sizes[chromosome] % sliceSize != 0:
            lastBin = (sizes[chromosome] // sliceSize) * sliceSize + 1
            chromosomeDensities[lastBin] = float(chromosomeCounts[lastBin]) / (sizes[chromosome] % sliceSize) * magnifyingFactor
        densities[chromosome] = chromosomeDensities
    return densities


if __name__ == "__main__":

    magnifyingFactor = 1000

    # parse command line
    description = "Get Distribution v1.0.1: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",       dest="inputFileName",     action="store",                           type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format",      dest="format",            action="store",                           type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output",      dest="outputFileName",    action="store",                           type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-r", "--reference",   dest="referenceFileName", action="store",      default=None,        type="string", help="file containing the genome [compulsory] [format: file in FASTA format]")
    parser.add_option("-n", "--nbBins",      dest="nbBins",            action="store",      default=1000,        type="int",    help="number of bins [default: 1000] [format: int]")
    parser.add_option("-2", "--bothStrands", dest="bothStrands",       action="store_true", default=False,                      help="plot one curve per strand [format: bool] [default: false]")
    parser.add_option("-w", "--raw",         dest="raw",               action="store_true", default=False,                      help="plot raw number of occurrences instead of density [format: bool] [default: false]")
    parser.add_option("-x", "--csv",         dest="csv",               action="store_true", default=False,                      help="write a .csv file [format: bool]")
    parser.add_option("-c", "--chromosome",  dest="chromosome",        action="store",      default=None,        type="string", help="plot only a chromosome [format: string]")
    parser.add_option("-s", "--start",       dest="start",             action="store",      default=None,        type="int",    help="start from a given region [format: int]")
    parser.add_option("-e", "--end",         dest="end",               action="store",      default=None,        type="int",    help="end from a given region [format: int]")
    parser.add_option("-y", "--yMin",        dest="yMin",              action="store",      default=None,        type="int",    help="minimum value on the y-axis to plot [format: int]")
    parser.add_option("-Y", "--yMax",        dest="yMax",              action="store",      default=None,        type="int",    help="maximum value on the y-axis to plot [format: int]")
    parser.add_option("-g", "--gff",         dest="gff",               action="store_true", default=False,                      help="also write GFF3 file [format: bool] [default: false]")
    parser.add_option("-H", "--height",      dest="height",            action="store",      default=None,        type="int",    help="height of the graphics [format: int] [default: 300]")
    parser.add_option("-W", "--width",       dest="width",             action="store",      default=None,        type="int",    help="width of the graphics [format: int] [default: 1000]")
    parser.add_option("-v", "--verbosity",   dest="verbosity",         action="store",      default=1,           type="int",    help="trace level [default: 1] [format: int]")
    parser.add_option("-l", "--log",         dest="log",               action="store_true", default=False,                      help="write a log file [format: bool]")
    parser.add_option("-D", "--directory",   dest="working_Dir",       action="store",      default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
    (options, args) = parser.parse_args()

    if options.referenceFileName is not None:
        # get the sizes of the chromosomes
        sizes = readChromosomeSizes(options.referenceFileName, options.verbosity)
        if not sizes:
            raise Exception("No sequence found in reference file '%s'" % (options.referenceFileName))
        maxSize = max(sizes.values())
        if options.verbosity > 1:
            print("done")
        start = 0
        end   = maxSize
    else:
        if options.chromosome is None or options.start is None or options.end is None:
            raise Exception("Missing chromosome or start and end positions, or reference file")
        maxSize = options.end
        sizes   = {options.chromosome: options.end}
        start   = options.start
        end     = options.end

    sliceSize  = computeSliceSize(maxSize, options.nbBins)
    firstSlice = start // sliceSize

    # bins are keyed by their 1-based start position
    bins      = dict()
    binsPlus  = dict()
    binsMinus = dict()
    for chromosome in sizes:
        binStarts = [i * sliceSize + 1 for i in range(firstSlice, sizes[chromosome] // sliceSize + 1)]
        bins[chromosome]      = dict((binStart, 0) for binStart in binStarts)
        binsPlus[chromosome]  = dict((binStart, 0) for binStart in binStarts)
        binsMinus[chromosome] = dict((binStart, 0) for binStart in binStarts)

    parser   = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    progress = Progress(parser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
    maxSlice = 0
    # count the number of reads
    for transcript in parser.getIterator():
        if options.chromosome is None or (transcript.getChromosome() == options.chromosome and transcript.getStart() >= start and transcript.getStart() <= end):
            chromosome = transcript.getChromosome()
            sliceId    = transcript.getStart() // sliceSize
            binStart   = sliceId * sliceSize + 1
            if transcript.getDirection() == 1:
                binsPlus[chromosome][binStart] += 1
            else:
                binsMinus[chromosome][binStart] += 1
            bins[chromosome][binStart] += 1
            maxSlice = max(maxSlice, sliceId)
        progress.inc()
    progress.done()

    # compute densities
    densityPlus  = computeDensities(binsPlus, sizes, start, sliceSize, magnifyingFactor)
    densityMinus = computeDensities(binsMinus, sizes, start, sliceSize, magnifyingFactor)
    density      = dict()
    for chromosome in bins:
        density[chromosome] = dict((binStart, densityPlus[chromosome][binStart] + densityMinus[chromosome][binStart]) for binStart in bins[chromosome])

    # the minus strand is drawn below the x-axis
    for chromosome in densityMinus:
        for binStart in densityMinus[chromosome]:
            densityMinus[chromosome][binStart] *= -1
        for binStart in binsMinus[chromosome]:
            binsMinus[chromosome][binStart] *= -1

    for chromosome in density:
        maxX = max(bins[chromosome].keys())
        if maxX <= 1000:
            unit  = "nt."
            ratio = 1.0
        elif maxX <= 1000000:
            unit  = "kb"
            ratio = 1000.0
        else:
            unit  = "Mb"
            ratio = 1000000.0
        outputFileName = "%s_%s" % (options.outputFileName, chromosome)
        if options.start is not None and options.end is not None:
            outputFileName += ":%d-%d" % (options.start, options.end)
        outputFileName += ".png"
        plotter = RPlotter(outputFileName, options.verbosity)
        plotter.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
        plotter.setYLabel("# reads")
        if options.bothStrands:
            plotter.setImageSize(1000, 300)
        else:
            plotter.setImageSize(1000, 200)
        if options.height is not None:
            plotter.setHeight(options.height)
        if options.width is not None:
            plotter.setWidth(options.width)
        # the minimum used to be guarded by yMax: -y alone was ignored, and -Y
        # alone passed None as the minimum
        if options.yMin is not None:
            plotter.setMinimumY(options.yMin)
        if options.yMax is not None:
            plotter.setMaximumY(options.yMax)
        if options.bothStrands:
            if options.raw:
                plotter.addLine(divideKeyDict(binsPlus[chromosome], ratio))
                plotter.addLine(divideKeyDict(binsMinus[chromosome], ratio))
            else:
                plotter.addLine(divideKeyDict(densityPlus[chromosome], ratio))
                plotter.addLine(divideKeyDict(densityMinus[chromosome], ratio))
        else:
            if options.raw:
                plotter.addLine(divideKeyDict(bins[chromosome], ratio))
            else:
                plotter.addLine(divideKeyDict(density[chromosome], ratio))
        plotter.plot()

    if options.csv:
        outputFileName = "%s" % (options.outputFileName)
        if options.chromosome is not None:
            outputFileName += "_%s" % (options.chromosome)
        if options.start is not None and options.end is not None:
            outputFileName += ":%d-%d" % (options.start, options.end)
        outputFileName += ".csv"
        with open(outputFileName, "w") as csvHandle:
            for sliceId in range(firstSlice, maxSlice + 1):
                csvHandle.write(";%d-%d" % (sliceId * sliceSize + 1, (sliceId+1) * sliceSize))
            csvHandle.write("\n")
            if options.bothStrands:
                for chromosome in densityPlus:
                    if len(densityPlus[chromosome]) > 0:
                        csvHandle.write("%s [+]" % (chromosome))
                        for binStart in sorted(densityPlus[chromosome].keys()):
                            csvHandle.write(";%.2f" % (densityPlus[chromosome][binStart]))
                        csvHandle.write("\n")
                    if len(densityMinus[chromosome]) > 0:
                        csvHandle.write("%s [-]" % (chromosome))
                        # stored values are negated for plotting: write them back as positive
                        for binStart in sorted(densityMinus[chromosome].keys()):
                            csvHandle.write(";%.2f" % (-densityMinus[chromosome][binStart]))
                        csvHandle.write("\n")
            else:
                for chromosome in density:
                    if len(density[chromosome]) > 0:
                        csvHandle.write(chromosome)
                        for binStart in sorted(density[chromosome].keys()):
                            csvHandle.write(";%.2f" % (density[chromosome][binStart]))
                        csvHandle.write("\n")

    if options.gff:
        startText = "" if options.start is None else "%d" % (options.start)
        endText   = "" if options.end is None else "%d" % (options.end)
        link1     = "" if options.start is None and options.end is None else ":"
        link2     = "" if options.start is None and options.end is None else "-"
        writer    = Gff3Writer("%s%s%s%s%s.gff3" % (options.outputFileName, link1, startText, link2, endText), options.verbosity)
        cpt = 1
        if options.raw:
            valuesPlus  = binsPlus
            valuesMinus = binsMinus
            values      = bins
        else:
            valuesPlus  = densityPlus
            valuesMinus = densityMinus
            values      = density
        if options.bothStrands:
            for chromosome in values:
                for binStart in valuesPlus[chromosome]:
                    writer.addTranscript(setTranscript(chromosome, 1, binStart, binStart + sliceSize, "region%d" % (cpt), valuesPlus[chromosome][binStart]))
                    cpt += 1
                for binStart in valuesMinus[chromosome]:
                    writer.addTranscript(setTranscript(chromosome, -1, binStart, binStart + sliceSize, "region%d" % (cpt), - valuesMinus[chromosome][binStart]))
                    cpt += 1
        else:
            for chromosome in values:
                for binStart in values[chromosome]:
                    writer.addTranscript(setTranscript(chromosome, 1, binStart, binStart + sliceSize, "region%d" % (cpt), values[chromosome][binStart]))
                    cpt += 1
        writer.write()
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getElement.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,106 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
"""Get the first element (exon / intron) from a list of transcripts"""


if __name__ == "__main__":

    # sys.exit() is used below, but this script never imported sys explicitly
    import sys

    # parse command line
    description = "Get Element v1.0.1: Get the first element (exon / intron) from a list of transcripts. [Category: Personnal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format",    dest="format",         action="store",                     type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-y", "--mysql",     dest="mysql",          action="store_true", default=False,                help="mySQL output [format: bool] [default: false]")
    parser.add_option("-t", "--type",      dest="type",           action="store",                     type="string", help="type of the element [format: choice (exon, intron)]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
    parser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    # reject an unknown element type up front: it used to be detected only
    # while handling the first transcript, hence never on an empty input
    if options.type not in ("exon", "intron"):
        sys.exit("Cannot understand type %s" % (options.type))

    parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    writer = Gff3Writer(options.outputFileName, options.verbosity)
    # the MySQL writer is only needed, hence only created, with --mysql
    sqlWriter = MySqlTranscriptWriter(options.outputFileName, options.verbosity) if options.mysql else None

    nbLines = parser.getNbTranscripts()
    print("%i lines found" % (nbLines))

    # treat transcripts
    nbWritten = 0
    nbUsed    = 0
    progress  = Progress(nbLines, "Analyzing transcripts of " + options.inputFileName, options.verbosity)
    for transcript in parser.getIterator():

        outTranscript = Transcript()
        outTranscript.setName(transcript.getName())
        outTranscript.setDirection(transcript.getDirection())
        outTranscript.setChromosome(transcript.getChromosome())

        if options.type == "exon":
            # only transcripts with more than one exon are used
            if len(transcript.getExons()) > 1:
                transcript.sortExons()
                firstExon = transcript.getExons()[0]
                outTranscript.setStart(firstExon.getStart())
                outTranscript.setEnd(firstExon.getEnd())
                writer.addTranscript(outTranscript)
                if options.mysql:
                    sqlWriter.addTranscript(transcript)
                nbWritten += 1
                nbUsed += 1
        else:
            used = False
            for intron in transcript.getIntrons():
                used = True
                thisTranscript = Transcript()
                thisTranscript.copy(outTranscript)
                thisTranscript.setStart(intron.getStart())
                thisTranscript.setEnd(intron.getEnd())
                writer.addTranscript(thisTranscript)
                if options.mysql:
                    sqlWriter.addTranscript(transcript)
                nbWritten += 1
            if used:
                nbUsed += 1
        progress.inc()
    progress.done()

    if options.mysql:
        sqlWriter.write()

    print("nb sequences used: %d" % (nbUsed))
    print("nb elements used: %d" % (nbWritten))
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
def zeroBaseToOneBaseConvertor(x):
    """Turn a positive 1-based position into a 0-based index.

    Zero and negative values (positions counted from the end) are returned
    unchanged.  The name is historical: the conversion goes from 1-based to
    0-based.
    """
    return x - 1 if x > 0 else x


class GetExons(object):
    """Write the exons of each input transcript as stand-alone transcripts.

    An optional selection such as '1,2,5..-3,-1' restricts the output to some
    exons: single positions and 'start..end' intervals, where positive numbers
    are 1-based and negative numbers count from the last exon.
    """

    def __init__(self, verbosity):
        self.verbosity          = verbosity
        self.selection          = False
        self.selectionItems     = []
        self.selectionIntervals = []
        self.parser             = None
        self.writer             = None

    def setInputFile(self, fileName, format):
        """Open *fileName* with the parser matching *format*."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.parser = chooser.getParser(fileName)

    def setSelection(self, selection):
        """Parse the exon selection string; None selects every exon.

        Raises Exception when an element is not an integer, or has more than
        one '..' separator.
        """
        if selection is None:
            return
        items     = []
        intervals = []
        for part in selection.split(","):
            try:
                bounds = [int(bound) for bound in part.split("..")]
            except ValueError:
                # report the offending text: the message used to concatenate a
                # string with a list (or an unbound name) and crashed itself
                raise Exception("Elements '" + part + "' of selection '" + selection + "' do no seem to be integers!")
            if len(bounds) == 1:
                items.append(bounds[0])
            elif len(bounds) == 2:
                intervals.append((bounds[0], bounds[1]))
            else:
                raise Exception("Cannot parse elements '" + part + "' of selection '" + selection + "'!")
        self.selection          = True
        self.selectionItems     = items
        self.selectionIntervals = intervals

    def getSelectionExonIndices(self, nbExons):
        """Return the 0-based indices of the selected exons among *nbExons*.

        Single positions come first, then intervals, in selection order.
        Positions that do not exist in a transcript with *nbExons* exons are
        skipped.
        """
        allIndices = list(range(nbExons))
        if not self.selection:
            return allIndices
        indices = []
        for item in self.selectionItems:
            index = zeroBaseToOneBaseConvertor(item)
            # skip positions this transcript does not have, instead of failing
            if -nbExons <= index < nbExons:
                indices.append(allIndices[index])
        for start, end in self.selectionIntervals:
            # A positive 1-based inclusive end equals the 0-based exclusive
            # slice stop.  The former "convert, then add one if > 0" missed
            # end == 1, so '1..1' selected nothing.
            # NOTE(review): a negative end is used as an exclusive slice stop
            # ('5..-1' leaves the last exon out) -- confirm this is intended.
            indices.extend(allIndices[zeroBaseToOneBaseConvertor(start):end])
        return indices

    def setOutputFile(self, fileName):
        """Create the GFF3 transcript writer for *fileName*."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def run(self):
        """Read every transcript and write its selected exons."""
        progress = Progress(self.parser.getNbTranscripts(), "Reading input file", self.verbosity)
        nbExons  = 0
        for cpt1, transcript in enumerate(self.parser.getIterator()):
            selectedExons = set(self.getSelectionExonIndices(transcript.getNbExons()))
            transcript.sortExons()
            for cpt2, exon in enumerate(transcript.getExons()):
                if cpt2 not in selectedExons:
                    continue
                exonTranscript = Transcript()
                exonTranscript.copy(exon)
                # the exon becomes a top-level feature: drop the link to its transcript
                if "Parent" in exonTranscript.tags:
                    del exonTranscript.tags["Parent"]
                exonTranscript.tags["feature"] = "transcript"
                if "ID" not in exonTranscript.tags or exonTranscript.tags["ID"] == "unnamed transcript":
                    exonTranscript.tags["ID"] = "exon_%d-%d" % (cpt1+1, cpt2+1)
                if exonTranscript.getName() == "unnamed transcript":
                    exonTranscript.setName("exon_%d-%d" % (cpt1+1, cpt2+1))
                self.writer.addTranscript(exonTranscript)
                nbExons += 1
            progress.inc()
        self.writer.write()
        self.writer.close()
        progress.done()
        if self.verbosity > 1:
            print("%d transcripts read" % (self.parser.getNbTranscripts()))
            print("%d exons written" % (nbExons))


if __name__ == "__main__":

    description = "Get Exons v1.0.1: Get the exons of a set of transcripts. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",               type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format",    dest="format",         action="store",               type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-s", "--select",    dest="select",         action="store", default=None, type="string", help="select some of the exons (like '1,2,5..-3,-1') [format: string]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",               type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    ge = GetExons(options.verbosity)
    ge.setInputFile(options.inputFileName, options.format)
    ge.setSelection(options.select)
    ge.setOutputFile(options.outputFileName)
    ge.run()
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Compare overlap of a transcript list and list of read, and get some info depending on the coverage"""
+
+import os
+from optparse import OptionParser
+from commons.core.parsing.SequenceListParser import *
+from commons.core.writer.Gff3Writer import *
+from SMART.Java.Python.mySql.MySqlConnection import *
+from SMART.Java.Python.structure.TranscriptListsComparator import *
+from SMART.Java.Python.misc.RPlotter import *
+from SMART.Java.Python.misc.Progress import *
+# NOTE(review): TranscriptContainer and Utils are used below but never imported by name;
+# presumably one of the star imports above provides them -- verify.
+
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Get Info per Coverage v1.0.1: Get a list of information clustered by the density of the coverage on a genome. [Category: Personnal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in TXT format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    parser.add_option("-l", "--log", dest="log", action="store", default=None, type="string", help="write a log file [format: bool] [default: false]")
+    (options, args) = parser.parse_args()
+
+    # the -l value is used as the path of the log file
+    logHandle = None
+    if options.log != None:
+        logHandle = open(options.log, "w")
+
+    transcriptContainer1 = TranscriptContainer(options.inputFileName1, options.format1, options.verbosity)
+    transcriptContainer2 = TranscriptContainer(options.inputFileName2, options.format2, options.verbosity)
+
+    # compare file 1 (query) with file 2 (reference) and fetch the per-transcript values
+    transcriptListComparator = TranscriptListsComparator(logHandle, options.verbosity)
+    transcriptListComparator.restrictToStart(transcriptListComparator.REFERENCE, 10)
+    transcriptListComparator.getColinearOnly(True)
+    transcriptListComparator.computeOddsPerTranscript(True)
+    transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, transcriptContainer1)
+    transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.REFERENCE, transcriptContainer2)
+    transcriptListComparator.compareTranscriptList()
+    transcriptTables = transcriptListComparator.getOutputTables()
+
+    # transcript name -> (measure, coverage)
+    sizesWithIntrons = {}
+    sizesWithoutIntrons = {}
+    nbExons = {}
+    # measure -> sum of coverages (divided by the matching count further down)
+    averageSizesWithIntrons = {}
+    averageSizesWithoutIntrons = {}
+    averageNbExons = {}
+    # measure -> number of transcripts having that measure
+    sumSizesWithIntrons = {}
+    sumSizesWithoutIntrons = {}
+    sumSizesNbExons = {}
+    coverages = transcriptListComparator.getOddsPerTranscript()
+
+    # second pass over file 2: only transcripts present in coverages are counted
+    progress = Progress(transcriptContainer2.getNbTranscripts(), "Reading transcript file again", options.verbosity)
+    for transcript in transcriptContainer2.getIterator():
+        if transcript.name in coverages:
+            if transcript.getSizeWithIntrons() not in averageSizesWithIntrons:
+                averageSizesWithIntrons[transcript.getSizeWithIntrons()] = coverages[transcript.name]
+            else:
+                averageSizesWithIntrons[transcript.getSizeWithIntrons()] += coverages[transcript.name]
+            if transcript.getSizeWithIntrons() not in sumSizesWithIntrons:
+                sumSizesWithIntrons[transcript.getSizeWithIntrons()] = 1
+            else:
+                sumSizesWithIntrons[transcript.getSizeWithIntrons()] += 1
+            if transcript.getSize() not in averageSizesWithoutIntrons:
+                averageSizesWithoutIntrons[transcript.getSize()] = coverages[transcript.name]
+            else:
+                averageSizesWithoutIntrons[transcript.getSize()] += coverages[transcript.name]
+            if transcript.getSize() not in sumSizesWithoutIntrons:
+                sumSizesWithoutIntrons[transcript.getSize()] = 1
+            else:
+                sumSizesWithoutIntrons[transcript.getSize()] += 1
+            if transcript.getNbExons() not in averageNbExons:
+                averageNbExons[transcript.getNbExons()] = coverages[transcript.name]
+            else:
+                averageNbExons[transcript.getNbExons()] += coverages[transcript.name]
+            if transcript.getNbExons() not in sumSizesNbExons:
+                sumSizesNbExons[transcript.getNbExons()] = 1
+            else:
+                sumSizesNbExons[transcript.getNbExons()] += 1
+            sizesWithIntrons[transcript.name] = (transcript.getSizeWithIntrons(), coverages[transcript.name])
+            sizesWithoutIntrons[transcript.name] = (transcript.getSize(), coverages[transcript.name])
+            nbExons[transcript.name] = (transcript.getNbExons(), coverages[transcript.name])
+        progress.inc()
+    progress.done()
+
+    # scatter plots of the per-transcript values; output names are prefixed with the -o value
+    plotterSizeWithIntrons = RPlotter("%sWithIntrons.png" % (options.output), options.verbosity)
+    plotterSizeWithIntrons.setPoints(True)
+    plotterSizeWithIntrons.setMaximumX(10000)
+    plotterSizeWithIntrons.setMaximumY(1000)
+    plotterSizeWithIntrons.setLog("y")
+    plotterSizeWithIntrons.addLine(sizesWithIntrons)
+    plotterSizeWithIntrons.plot()
+
+    plotterSizeWithoutIntrons = RPlotter("%sWithoutIntrons.png" % (options.output), options.verbosity)
+    plotterSizeWithoutIntrons.setPoints(True)
+    plotterSizeWithoutIntrons.setMaximumX(10000)
+    plotterSizeWithoutIntrons.setMaximumY(1000)
+    plotterSizeWithoutIntrons.setLog("y")
+    plotterSizeWithoutIntrons.addLine(sizesWithoutIntrons)
+    plotterSizeWithoutIntrons.plot()
+
+    plotterNbExons = RPlotter("%sNbExons.png" % (options.output), options.verbosity)
+    plotterNbExons.setPoints(True)
+    plotterNbExons.addLine(nbExons)
+    plotterNbExons.plot()
+
+    # turn each coverage sum into an average, truncated to an integer
+    for element in averageSizesWithIntrons:
+        averageSizesWithIntrons[element] = int(float(averageSizesWithIntrons[element]) / sumSizesWithIntrons[element])
+    plotterAverageSizeWithIntrons = RPlotter("%sAverageWithIntrons.png" % (options.output), options.verbosity)
+    plotterAverageSizeWithIntrons.setMaximumX(10000)
+    plotterAverageSizeWithIntrons.setMaximumY(1000)
+    plotterAverageSizeWithIntrons.setLog("y")
+    plotterAverageSizeWithIntrons.addLine(averageSizesWithIntrons)
+    plotterAverageSizeWithIntrons.plot()
+    print "min/avg/med/max sizes with introns: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(averageSizesWithIntrons)
+
+    for element in averageSizesWithoutIntrons:
+        averageSizesWithoutIntrons[element] = int(float(averageSizesWithoutIntrons[element]) / sumSizesWithoutIntrons[element])
+    plotterAverageSizeWithoutIntrons = RPlotter("%sAverageWithoutIntrons.png" % (options.output), options.verbosity)
+    plotterAverageSizeWithoutIntrons.setMaximumX(10000)
+    plotterAverageSizeWithoutIntrons.setMaximumY(1000)
+    plotterAverageSizeWithoutIntrons.setLog("y")
+    plotterAverageSizeWithoutIntrons.addLine(averageSizesWithoutIntrons)
+    plotterAverageSizeWithoutIntrons.plot()
+    print "min/avg/med/max sizes without introns: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(averageSizesWithoutIntrons)
+
+    for element in averageNbExons:
+        averageNbExons[element] = int(float(averageNbExons[element]) / sumSizesNbExons[element])
+    plotterAverageNbExons = RPlotter("%sAverageNbExons.png" % (options.output), options.verbosity)
+    plotterAverageNbExons.addLine(averageNbExons)
+    plotterAverageNbExons.plot()
+    print "min/avg/med/max # exons: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(averageNbExons)
+
+    if options.log:
+        logHandle.close()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getIntrons.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/getIntrons.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,89 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.misc.Progress import Progress
+
+class GetIntrons(object):
+    """Write every intron of the input transcripts as a separate transcript in a GFF3 file."""
+
+    def __init__(self, verbosity):
+        self.verbosity = verbosity
+
+    def setInputFile(self, fileName, format):
+        """Create the parser for fileName through ParserChooser, using the given format."""
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.parser = chooser.getParser(fileName)
+
+    def setOutputFile(self, fileName):
+        """Create the GFF3 writer for fileName."""
+        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+
+    def run(self):
+        """Copy each intron into a new Transcript, name it if needed, and write it out."""
+        progress = Progress(self.parser.getNbTranscripts(), "Reading input file", self.verbosity)
+        nbIntrons = 0
+        for cpt1, transcript in enumerate(self.parser.getIterator()):
+            for cpt2, intron in enumerate(transcript.getIntrons()):
+                intronTranscript = Transcript()
+                intronTranscript.copy(intron)
+                # the intron becomes a top-level feature: drop its parent link
+                if "Parent" in intronTranscript.tags:
+                    del intronTranscript.tags["Parent"]
+                intronTranscript.tags["feature"] = "transcript"
+                # unnamed introns get an ID / name built from the 1-based transcript and intron ranks
+                if "ID" not in intronTranscript.tags or intronTranscript.tags["ID"] == "unnamed transcript":
+                    intronTranscript.tags["ID"] = "intron_%d-%d" % (cpt1+1, cpt2+1)
+                if intronTranscript.getName() == "unnamed transcript":
+                    intronTranscript.setName("intron_%d-%d" % (cpt1+1, cpt2+1))
+                self.writer.addTranscript(intronTranscript)
+                nbIntrons += 1
+            progress.inc()
+        self.writer.write()
+        self.writer.close()
+        progress.done()
+        if self.verbosity > 1:
+            print "%d transcripts read" % (self.parser.getNbTranscripts())
+            print "%d introns written" % (nbIntrons)
+
+
+if __name__ == "__main__":
+
+    description = "Get Introns v1.0.1: Get the introns of a set of transcripts. [Category: Data Modification]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    gi = GetIntrons(options.verbosity)
+    gi.setInputFile(options.inputFileName, options.format)
+    gi.setOutputFile(options.outputFileName)
+    gi.run()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getLetterDistribution.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/getLetterDistribution.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,153 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+"""Get the size distribution of a Fasta / BED file"""
+
+import os
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import *
+from SMART.Java.Python.misc.Progress import *
+from SMART.Java.Python.misc.RPlotter import *
+from commons.core.parsing.ParserChooser import ParserChooser
+
+
+def writeCVSfile(outHandler):
+    """Write one line per position: the position, then 'letter=rate%;' for every letter.
+
+    Reads the module-level tables filled by the main script: nbPositions,
+    lettersRate (for the set of letters) and positionRate, all of which use
+    1-based positions as keys.
+    """
+    # iterate over the positions actually recorded, not over the last sequence read
+    for position in sorted(nbPositions):
+        outHandler.write("%s;" % (position))
+        for letter in lettersRate:
+            # positionRate is keyed by 1-based position, like the number written above
+            if position in positionRate[letter]:
+                outHandler.write("%s=%.2f%s;" % (letter, positionRate[letter][position], "%"))
+            else:
+                outHandler.write("%s=0%s;" % (letter, "%"))
+        outHandler.write("\n")
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Get Letter Distribution v1.0.1: Compute the distribution of nucleotides of a set of genomic coordinates. [Category: Visualization]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file to be analyzed [compulsory] [format: file in sequence format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: sequence file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    parser.add_option("-c", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool] [default: false]")
+    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
+    (options, args) = parser.parse_args()
+
+    # from here on "parser" is the sequence parser, not the option parser
+    chooser = ParserChooser()
+    chooser.findFormat(options.format)
+    parser = chooser.getParser(options.inputFileName)
+    nbSequences = parser.getNbSequences()
+    print "%i sequences read" % (nbSequences)
+
+    # treat items
+    progress = Progress(nbSequences, "Analyzing sequences of " + options.inputFileName, options.verbosity)
+    nbLettersTotal = 0
+    nbLetters = {}       # letter -> total count
+    lettersRate = {}     # letter -> {integer percentage in a sequence -> number of sequences}
+    nbPositions = {}     # 1-based position -> number of sequences reaching it
+    positionCount = {}   # letter -> {1-based position -> count}
+    positionRate = {}    # letter -> {1-based position -> percentage}
+    nbPositionRate = {}  # 1-based position -> percentage of sequences reaching it
+    for sequence in parser.getIterator():
+        letters = sequence.getSequence()
+        thisNbLettersTotal = sequence.getSize()
+        nbLettersTotal += thisNbLettersTotal
+        thisNbLetters = {}
+
+        for pos in range(len(letters)):
+            letter = letters[pos]
+            if letter not in thisNbLetters:
+                thisNbLetters[letter] = 1
+            else:
+                thisNbLetters[letter] += 1
+            if pos+1 not in nbPositions:
+                nbPositions[pos+1] = 1
+            else:
+                nbPositions[pos+1] += 1
+            if letter not in positionCount:
+                positionCount[letter] = {}
+            if pos+1 not in positionCount[letter]:
+                positionCount[letter][pos+1] = 1
+            else:
+                positionCount[letter][pos+1] += 1
+
+        for letter in thisNbLetters:
+            if letter not in nbLetters:
+                nbLetters[letter] = thisNbLetters[letter]
+            else:
+                nbLetters[letter] += thisNbLetters[letter]
+            if letter not in lettersRate:
+                lettersRate[letter] = {}
+            rate = int(float(thisNbLetters[letter]) / thisNbLettersTotal * 100)
+            if rate not in lettersRate[letter]:
+                lettersRate[letter][rate] = 1
+            else:
+                lettersRate[letter][rate] += 1
+        progress.inc()
+    progress.done()
+
+    for letter in positionCount:
+        positionRate[letter] = {}
+        for pos in positionCount[letter]:
+            positionRate[letter][pos] = positionCount[letter][pos] / float(nbPositions[pos]) * 100
+    for pos in nbPositions:
+        nbPositionRate[pos] = nbPositions[pos] / float(nbPositions[1]) * 100
+
+    # plot content distributions
+    plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity, True)
+    plotter.setFill(0)
+    plotter.setLegend(True)
+    for letter in lettersRate:
+        plotter.addLine(lettersRate[letter], letter)
+    plotter.plot()
+
+    # plot distribution per position
+    plotter = RPlotter("%sPerNt.png" % (options.outputFileName), options.verbosity, True)
+    plotter.setFill(0)
+    plotter.setLegend(True)
+    plotter.setXLabel("Position on the read")
+    plotter.setYLabel("Percentage")
+    for letter in positionRate:
+        plotter.addLine(positionRate[letter], letter)
+    plotter.addLine(nbPositionRate, "#")
+    plotter.plot()
+
+    if options.csv:
+        outHandler = open("%s.csv" % (options.outputFileName), "w")
+        writeCVSfile(outHandler)
+        outHandler.close()
+
+    print "%d sequences" % (nbSequences)
+    print "%d letters" % (nbLettersTotal)
+    for letter in nbLetters:
+        print "%s: %d (%.2f%%)" % (letter, nbLetters[letter], float(nbLetters[letter]) / nbLettersTotal * 100)
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getNb.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/getNb.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,99 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge.
Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Get the repartition of some elements (# exons per transcripts, # of repetitions of a mapping or # of transcripts in a cluster)"""
+
+from optparse import OptionParser
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+from SMART.Java.Python.misc.RPlotter import RPlotter
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc import Utils
+from math import *
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Get Nb v1.0.1: Get the distribution of exons per transcripts, or mapping per read, or transcript per cluster. [Category: Visualization]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in png format]")
+    parser.add_option("-q", "--query", dest="query", action="store", type="string", help="query [compulsory] (# exons, # transcripts) [format: choice (exon, transcript, cluster)]")
+    parser.add_option("-b", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]")
+    parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
+    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
+    (options, args) = parser.parse_args()
+
+    # only the three queries below are understood
+    if options.query != "exon" and options.query != "transcript" and options.query != "cluster":
+        raise Exception("Do not understand query %s" % (options.query))
+
+    exonDistribution = {}        # number of exons -> number of elements
+    transcriptDistribution = {}  # element name -> number of occurrences
+    clusterDistribution = {}     # "nbElements" tag value (1 when absent) -> number of elements
+
+    transcriptContainer = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
+
+    progress = Progress(transcriptContainer.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
+    # count the number of reads
+    for element in transcriptContainer.getIterator():
+        if options.query == "exon":
+            nbExons = element.getNbExons()
+            exonDistribution[nbExons] = exonDistribution.get(nbExons, 0) + 1
+        elif options.query == "transcript":
+            name = element.getName()
+            transcriptDistribution[name] = transcriptDistribution.get(name, 0) + 1
+        elif options.query == "cluster":
+            # NOTE(review): the default is the int 1 while getTagValue() may return another type -- confirm keys are comparable
+            nbElements = 1 if "nbElements" not in element.getTagNames() else element.getTagValue("nbElements")
+            clusterDistribution[nbElements] = clusterDistribution.get(nbElements, 0) + 1
+        progress.inc()
+    progress.done()
+
+    if options.query == "exon":
+        distribution = exonDistribution
+    elif options.query == "transcript":
+        # turn name -> occurrences into occurrences -> number of names
+        distribution = {}
+        for name in transcriptDistribution:
+            distribution[transcriptDistribution[name]] = distribution.get(transcriptDistribution[name], 0) + 1
+    elif options.query == "cluster":
+        distribution = clusterDistribution
+
+    outputFileName = options.outputFileName
+    plotter = RPlotter(outputFileName, options.verbosity)
+    plotter.setImageSize(1000, 300)
+    plotter.setFill(0)
+    plotter.setMaximumX(options.xMax)
+    plotter.setBarplot(options.barplot)
+    plotter.addLine(distribution)
+    plotter.plot()
+
+    print "min/avg/med/max: %d/%.2f/%.1f/%d" % (Utils.getMinAvgMedMax(distribution))
+
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getRandomRegions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/getRandomRegions.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,267 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+"""Find random regions in a genome"""
+
+import random, math
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import *
+from commons.core.writer.Gff3Writer import *
+from commons.core.writer.MySqlTranscriptWriter import *
+from SMART.Java.Python.misc.Progress import *
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+
+# number of random draws averaged by RandomRegionsGenerator.getDev()
+repetitions = 100
+
+
+class RandomRegionsGenerator(object):
+    """Generate random regions, or move existing transcripts to random places, and write them as GFF3."""
+
+    def __init__(self, verbosity):
+        self.verbosity = verbosity
+        self.strands = False
+        self.distribution = "uniform"
+        # list of transcripts to shuffle, filled by setAnnotation()
+        self.transcripts = None
+        # FASTA parser giving the chromosome sizes, set by setInput()
+        self.sequenceParser = None
+        random.seed()
+
+
+    def setInput(self, fileName):
+        """Use the FASTA file fileName as reference for chromosome names and sizes."""
+        self.sequenceParser = FastaParser(fileName, self.verbosity)
+
+
+    def setGenomeSize(self, size):
+        """Set the size of the single chromosome used when no reference file is given."""
+        self.genomeSize = size
+
+
+    def setChromosomeName(self, name):
+        """Set the name of the single chromosome used when no reference file is given."""
+        self.chromosomeName = name
+
+
+    def setAnnotation(self, fileName, format):
+        """Load the transcripts to shuffle; the number of regions becomes their count and the size 0."""
+        parser = TranscriptContainer(fileName, format, self.verbosity)
+        self.transcripts = []
+        for transcript in parser.getIterator():
+            self.transcripts.append(transcript)
+        self.setNumber(len(self.transcripts))
+        self.setSize(0)
+
+
+    def setOutputFile(self, fileName):
+        self.outputFileName = fileName
+
+
+    def setSize(self, size):
+        """Use a fixed region size (minimum and maximum are both set to size)."""
+        self.minSize = size
+        self.maxSize = size
+
+
+    def setMinSize(self, size):
+        self.minSize = size
+
+
+    def setMaxSize(self, size):
+        self.maxSize = size
+
+
+    def setNumber(self, number):
+        self.number = number
+
+
+    def setStrands(self, strands):
+        self.strands = strands
+
+
+    def setMaxDistribution(self, maxElements):
+        """Set the maximum cluster size and switch to the gaussian distribution (no-op on None)."""
+        if maxElements == None:
+            return
+        self.maxElements = maxElements
+        self.distribution = "gaussian"
+
+
+    def setDeviationDistribution(self, deviation):
+        """Set the position/size deviation and switch to the gaussian distribution (no-op on None)."""
+        if deviation == None:
+            return
+        self.deviation = deviation
+        self.distribution = "gaussian"
+
+
+    def getSizes(self):
+        """Fill chromosomes, sizes, cumulatedSize and cumulatedSizes from the reference or the given single chromosome."""
+        if self.sequenceParser == None:
+            self.chromosomes = [self.chromosomeName]
+            self.sizes = {self.chromosomeName: self.genomeSize}
+            self.cumulatedSize = self.genomeSize
+            self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
+            return
+        self.chromosomes = self.sequenceParser.getRegions()
+        self.sizes = {}
+        self.cumulatedSize = 0
+        self.cumulatedSizes = {}
+        for chromosome in self.chromosomes:
+            self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome)
+            self.cumulatedSize += self.sizes[chromosome]
+            # running total up to and including this chromosome
+            self.cumulatedSizes[chromosome] = self.cumulatedSize
+
+
+    def findPosition(self, size = None):
+        """Return a random (chromosome, start, size); size is drawn in [minSize, maxSize] when not given."""
+        if size == None:
+            size = random.randint(self.minSize, self.maxSize)
+        # pick a chromosome with a probability following its size, using the running totals
+        integer = random.randint(0, self.cumulatedSize)
+        for chromosome in self.chromosomes:
+            if self.cumulatedSizes[chromosome] > integer:
+                break
+        start = random.randint(1, self.sizes[chromosome] - size)
+        return (chromosome, start, size)
+
+
+    def createTranscript(self, chromosome, start, size, strand, cpt):
+        """Build a single-interval Transcript named rand_<cpt>."""
+        transcript = Transcript()
+        transcript.setChromosome(chromosome)
+        transcript.setStart(start)
+        transcript.setEnd(start + size-1)
+        transcript.setDirection(strand)
+        transcript.setName("rand_%d" % (cpt))
+        return transcript
+
+
+    def moveTranscript(self, chromosome, start, transcript):
+        """Shift transcript (and its exons) so that it starts at start on chromosome.
+
+        A new position is drawn as long as the moved transcript would end
+        after the end of the chromosome.  Returns a one-element list.
+        """
+        # compare with the size of this chromosome, not with the running total over chromosomes
+        while transcript.getEnd() + start - transcript.getStart() > self.sizes[chromosome]:
+            chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart())
+        transcript.setChromosome(chromosome)
+        oldStart, oldEnd = transcript.getStart(), transcript.getEnd()
+        if transcript.getNbExons() > 1:
+            # iterate over the exons themselves (getNbExons() is only their count)
+            for exon in transcript.getExons():
+                oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd()
+                exon.setStart(oldExonStart + start - oldStart)
+                exon.setEnd(oldExonEnd + start - oldStart)
+        transcript.setStart(start)
+        transcript.setEnd(oldEnd + start - oldStart)
+        return [transcript]
+
+
+    def createUniformCluster(self, chromosome, start, size, strand, cpt):
+        """Return a one-element list holding a new random transcript."""
+        transcript = self.createTranscript(chromosome, start, size, strand, cpt)
+        return [transcript]
+
+
+    def findNbTranscripts(self, cpt):
+        """Draw a cluster size between 1 and maxElements, capped by the number of regions still to produce."""
+        return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1)
+
+
+    def getDev(self):
+        """Return the rounded mean of `repetitions` uniform draws in [-deviation, deviation]."""
+        deviation = 0.0
+        for j in range(repetitions):
+            deviation += random.randint(-self.deviation, self.deviation)
+        deviation /= repetitions
+        deviation = int(round(deviation))
+        return deviation
+
+
+    def createGaussianCluster(self, chromosome, start, size, strand, cpt):
+        """Return a cluster of transcripts whose starts and sizes are jittered by getDev()."""
+        transcripts = []
+        nbTranscripts = self.findNbTranscripts(cpt)
+        for i in range(nbTranscripts):
+            transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i)
+            transcripts.append(transcript)
+        return transcripts
+
+
+    def writeRegions(self):
+        """Produce self.number regions and write them to the output file."""
+        writer = Gff3Writer(self.outputFileName, self.verbosity)
+        # NOTE(review): this second handle on the same file is never written to -- confirm it is needed
+        outputFile = open(self.outputFileName, "w")
+        progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
+        i = 0
+        while i < self.number:
+            chromosome, start, size = self.findPosition()
+            strand = random.choice([-1, 1]) if self.strands else 1
+            if self.transcripts != None:
+                transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
+            elif self.distribution == "uniform":
+                transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
+            else:
+                transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
+            # a cluster may hold several transcripts: each one counts towards self.number
+            for transcript in transcripts:
+                writer.addTranscript(transcript)
+                i += 1
+                progress.inc()
+        progress.done()
+        outputFile.close()
+        writer.write()
+        writer.close()
+
+
+    def run(self):
+        """Compute the chromosome sizes, then generate and write the regions."""
+        self.getSizes()
+        self.writeRegions()
+
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). 
[Category: Other]" + + parser = OptionParser(description = description) + parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]") + parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]") + parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]") + parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]") + parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]") + parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]") + parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]") + parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region 
set is provided) [format: boolean]") + parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]") + parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + rrg = RandomRegionsGenerator(options.verbosity) + if options.reference == None: + rrg.setGenomeSize(options.referenceSize) + rrg.setChromosomeName(options.chromosome) + else: + rrg.setInput(options.reference) + rrg.setOutputFile(options.outputFileName) + if options.inputFileName == None: + if options.size != None: + rrg.setSize(options.size) + else: + rrg.setMinSize(options.minSize) + rrg.setMaxSize(options.maxSize) + rrg.setNumber(options.number) + rrg.setStrands(options.strands) + else: + rrg.setAnnotation(options.inputFileName, options.format) + rrg.setMaxDistribution(options.max) + rrg.setDeviationDistribution(options.deviation) + rrg.run() + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getReadDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getReadDistribution.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,129 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +""" +Plot the data from the data files +""" +import os +from optparse import OptionParser +from commons.core.parsing.FastaParser import FastaParser +from commons.core.parsing.FastqParser import FastqParser +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc import Utils + + +if __name__ == "__main__": + + # parse command line + description = "Get Read Distribution v1.0.1: Plot the number of identical reads and give the most represented. 
[Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file sequence [compulsory] [format: file in sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: sequence file format]") + parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="keep the best n [format: int]") + parser.add_option("-p", "--percent", dest="percent", action="store", default=None, type="float", help="keep the best n\% [format: float]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output files in PNG format and txt format]") + parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]") + parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + if options.working_Dir[-1] != '/': + options.outputFileName = options.working_Dir + '/' + options.outputFileName + + if options.format == "fasta": + parser = FastaParser(options.inputFileName, options.verbosity) + elif options.format == "fastq": + parser = FastqParser(options.inputFileName, options.verbosity) + else: + raise Exception("Do not understand '%s' file format." 
% (options.format)) + + progress = Progress(parser.getNbSequences(), "Reading %s" % (options.inputFileName), options.verbosity) + sequences = {} + for sequence in parser.getIterator(): + sequence = sequence.sequence + if sequence not in sequences: + sequences[sequence] = 1 + else: + sequences[sequence] += 1 + progress.inc() + progress.done() + + values = sequences.values() + values.sort() + if options.percent != None: + threshold = values[int(float(options.percent) / 100 * len(values))] + elif options.number != None: + threshold = values[-options.number] + else: + threshold = 0 + + # sort by value + progress = Progress(parser.getNbSequences(), "Sorting values", options.verbosity) + sortedValues = dict([(value, []) for value in sequences.values()]) + for sequence, value in sequences.iteritems(): + sortedValues[value].append(sequence) + progress.inc() + progress.done() + + outputFileName = "%s.txt" % (options.outputFileName) + handle = open(outputFileName, "w") + progress = Progress(parser.getNbSequences(), "Writing into %s" % (outputFileName), options.verbosity) + for value in reversed(sorted(sortedValues.keys())): + if value >= threshold: + for sequence in sortedValues[value]: + handle.write("%s\t%d\n" % (sequence, value)) + progress.inc() + progress.done() + handle.close() + + line = {} + progress = Progress(len(values), "Preparing plot", options.verbosity) + for value in values: + if value not in line: + line[value] = 1 + else: + line[value] += 1 + progress.inc() + progress.done() + + plot = RPlotter("%s.png" % (options.outputFileName), options.verbosity) + plot.setFill(0) + plot.setMaximumX(options.xMax) + plot.setXLabel("# occurrences") + plot.setYLabel("# reads") + plot.addLine(line) + plot.plot() + + if options.verbosity > 0: + print "%d/%.2f/%.1f/%d occurrences" % (Utils.getMinAvgMedMax(line)) + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getSequence.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getSequence.py Thu May 
02 09:56:47 2013 -0400 @@ -0,0 +1,60 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Get a given sequence in a multi-Fasta file""" +import sys +from optparse import OptionParser +from commons.core.parsing.FastaParser import FastaParser +from SMART.Java.Python.misc.Progress import Progress +from commons.core.writer.FastaWriter import FastaWriter + +if __name__ == "__main__": + + # parse command line + description = "Get Sequence v1.0.1: Get a single sequence in a FASTA file. 
[Category: Data Selection]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName",action="store",type="string", help="multi-FASTA file [compulsory] [format: file in FASTA format]") + parser.add_option("-n", "--name",dest="name",action="store",type="string", help="name of the sequence [compulsory] [format: string]") + parser.add_option("-o", "--output",dest="outputFileName",action="store",type="string", help="output sequence file (FASTA) [compulsory] [format: file in FASTA format]") + parser.add_option("-v", "--verbosity", dest="verbosity",action="store",default=1,type="int",help="trace level [format: int]") + (options, args) = parser.parse_args() + + # read Fasta file + sequenceListParser = FastaParser(options.inputFileName, options.verbosity) + for sequence in sequenceListParser.getIterator(): + name = sequence.name.split(" ")[0] + if name == options.name: + writer = FastaWriter(options.outputFileName, options.verbosity) + writer.addSequence(sequence) + print sequence.printFasta(), + sys.exit(0) + writer.close() + print "No sequence found" diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getSizes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getSizes.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,218 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import os, sys +from optparse import OptionParser +from commons.core.parsing.FastaParser import FastaParser +from commons.core.parsing.FastqParser import FastqParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from commons.core.parsing.GffParser import GffParser +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc import Utils + +from commons.core.LoggerFactory import LoggerFactory +from commons.core.utils.RepetOptionParser import RepetOptionParser + +LOG_DEPTH = "smart" + +class GetSizes(object): + + def __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0): + self.inFileName = inFileName + self.inFormat= inFormat + self.outFileName = outFileName + self.query = query + self.xMax = xMax + self.xMin = xMin + self.xLab = "Size" + self.yLab = "# reads" + self.barplot = False + self._verbosity = verbosity + self.parser = None + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) + + def 
setAttributesFromCmdLine(self): + description = "Usage: getSizes.py [options]\n\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. [Category: Visualization]\n" + epilog = "" + parser = RepetOptionParser(description = description, epilog = epilog) + parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the input [compulsory] [format: transcript or sequence file format]") + parser.add_option("-q", "--query", dest="query", action="store", default=None, type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in PNG format]") + parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]") + parser.add_option("-X", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-a", "--xLabel", dest="xLab", action="store", default="Size", type="string", help="x absis label name [format: string] [default: Size]") + parser.add_option("-b", "--yLabel", dest="yLab", action="store", default="# reads", type="string", help="y absis label name [format: string] [default: Reads]") + parser.add_option("-B", "--barplot", dest="barplot", action="store_true", default=False, help="use barplot representation [format: bool] [default: false]") + options = parser.parse_args()[0] + self._setAttributesFromOptions(options) + + def 
_setAttributesFromOptions(self, options): + self.setInFileName(options.inputFileName) + self.setInFormat(options.format) + self.setQuery(options.query) + self.setOutFileName(options.outputFileName) + self.setXMax(options.xMax) + self.setXMin(options.xMin) + self.setxLab(options.xLab) + self.setyLab(options.yLab) + self.setBarplot(options.barplot) + self.setVerbosity(options.verbosity) + + def setInFileName(self, inputFileName): + self.inFileName = inputFileName + + def setInFormat(self, inFormat): + self.inFormat = inFormat + + def setQuery(self, query): + self.query = query + + def setOutFileName(self, outFileName): + self.outFileName = outFileName + + def setXMax(self, xMax): + self.xMax = xMax + + def setXMin(self, xMin): + self.xMin = xMin + + def setxLab(self, xLab): + self.xLab = xLab + + def setyLab(self, yLab): + self.yLab = yLab + + def setBarplot(self, barplot): + self.barplot = barplot + + def setVerbosity(self, verbosity): + self._verbosity = verbosity + + def _checkOptions(self): + if self.inFileName == None: + self._logAndRaise("ERROR: Missing input file name") + if self.inFormat == "fasta": + self.parser = FastaParser(self.inFileName, self._verbosity) + elif self.inFormat == "fastq": + self.parser = FastqParser(self.inFileName, self._verbosity) + else: + self.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity) + + def _logAndRaise(self, errorMsg): + self._log.error(errorMsg) + raise Exception(errorMsg) + + def run(self): + LoggerFactory.setLevel(self._log, self._verbosity) + self._checkOptions() + self._log.info("START getsizes") + self._log.debug("Input file name: %s" % self.inFileName) + + nbItems = self.parser.getNbItems() + self._log.info( "%i items found" % (nbItems)) + + # treat items + progress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity) + sizes = {} + minimum = 1000000000000 + maximum = 0 + sum = 0 + number = 0 + nbSubItems = 0 + for item in self.parser.getIterator(): + items 
= [] + if self.query == "exon": + items = item.getExons() + elif self.query == "exon1": + if len(item.getExons()) > 1: + item.sortExons() + items = [item.getExons()[0]] + elif self.query == "intron": + items = item.getIntrons() + else: + items = [item, ] + + for thisItem in items: + try: + nbElements = int(float(thisItem.getTagValue("nbElements"))) + if nbElements == None: + nbElements = 1 + except: + nbElements = 1 + size = thisItem.getSize() + minimum = min(minimum, size) + maximum = max(maximum, size) + + if size not in sizes: + sizes[size] = nbElements + else: + sizes[size] += nbElements + sum += size + nbSubItems += nbElements + number += 1 + progress.inc() + progress.done() + + if self.outFileName != None: + plotter = RPlotter(self.outFileName, self._verbosity) + plotter.setFill(0) + plotter.setMinimumX(self.xMin) + plotter.setMaximumX(self.xMax) + plotter.setXLabel(self.xLab) + plotter.setYLabel(self.yLab) + plotter.setBarplot(self.barplot) + plotter.addLine(sizes) + plotter.plot() + + if nbSubItems == 0: + self._logAndRaise("No item found") + + self.items = number + self.subItems = nbSubItems + self.nucleotides = sum + self.minAvgMedMax = Utils.getMinAvgMedMax(sizes) + + print "%d items" % (number) + print "%d sub-items" % (nbSubItems) + print "%d nucleotides" % (sum) + print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes) + + self._log.info("END getsizes") + + +if __name__ == "__main__": + iGetSizes = GetSizes() + iGetSizes.setAttributesFromCmdLine() + iGetSizes.run() + +#TODO: add two more options!!!!!! diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getWigData.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getWigData.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,67 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +from optparse import OptionParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from commons.core.parsing.WigParser import WigParser +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.misc.Progress import Progress + + +if __name__ == "__main__": + + # parse command line + description = "Get WIG Data v1.0.1: Compute the average data for some genomic coordinates using WIG files (thus covering a large proportion of the genome) and update a tag. 
[Category: WIG Tools]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-w", "--wig", dest="wig", action="store", type="string", help="wig file name [compulsory] [format: file in WIG format]") + parser.add_option("-t", "--tag", dest="tag", action="store", type="string", help="choose a tag name to write the wig information to output file [compulsory] [format: file in WIG format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider both strands separately [format: boolean] [default: False]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + # create parsers and writers + transcriptParser = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity) + wigParser = WigParser(options.wig) + writer = Gff3Writer(options.outputFileName, options.verbosity) + wigParser.setStrands(options.strands) + + progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity) + for transcript in transcriptParser.getIterator(): + values = transcript.extractWigData(wigParser) + if options.strands: + values = values[transcript.getDirection()] + transcript.setTagValue(options.tag, str(float(sum(values)) / len(values))) + writer.addTranscript(transcript) + progress.inc() + progress.done() diff -r d22fadc825e3 -r 
2c0c0a89fad7 SMART/Java/Python/getWigDistance.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getWigDistance.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,105 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +""" +Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks. 
+""" + +from optparse import OptionParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from commons.core.parsing.WigParser import WigParser +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.RPlotter import RPlotter + + +if __name__ == "__main__": + + # parse command line + description = "Get WIG Data v1.0.2: Compute the average data around some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-w", "--wig", dest="wig", action="store", type="string", help="wig file name [compulsory] [format: file in WIG format]") + parser.add_option("-d", "--distance", dest="distance", action="store", default=1000, type="int", help="distance around position [compulsory] [format: int] [default: 1000]") + parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider both strands separately [format: boolean] [default: False]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") + parser.add_option("-a", "--default", dest="defaultValue", action="store", default=0.0, type="float", help="default value (when value is NA) [default: 0.0] [format: float]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="use log scale for y-axis [format: boolean] [default: False]") + parser.add_option("-k", "--keep", dest="keep", action="store_true", default=False, help="keep temporary 
files [format: boolean] [default: False]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + # create parsers and writers + transcriptParser = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity) + wigParser = WigParser(options.wig) + wigParser.setStrands(options.strands) + wigParser.setDefaultValue(options.defaultValue) + + # allocate data + strands = (1, -1) if options.strands else (1, ) + values = {} + for strand in strands: + values[strand] = dict([(i, 0.0) for i in range(-options.distance, options.distance+1)]) + + # read transcripts + progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity) + for transcript in transcriptParser.getIterator(): + transcript.removeExons() + transcript.restrictStart(2) + transcript.extendStart(options.distance) + transcript.extendEnd(options.distance-1) + theseValues = transcript.extractWigData(wigParser) + if len(strands) == 1: + theseValues = {1: theseValues} + for strand in strands: + if len(theseValues[strand]) < 2 * options.distance + 1: + theseValues[strand] = [options.defaultValue] * (2 * options.distance + 1 - len(theseValues[strand])) + theseValues[strand] + if len(theseValues[strand]) != 2 * options.distance + 1: + raise Exception("Got something wrong with the size of the WIG data concerning %s: %d found instead of %d" % (transcript, len(theseValues[strand]), 2 * options.distance + 1)) + for i in range(-options.distance, options.distance+1): + values[strand][i] += theseValues[strand][i + options.distance] + progress.inc() + progress.done() + + for strand in strands: + for i in range(-options.distance, options.distance+1): + values[strand][i] /= transcriptParser.getNbTranscripts() * strand + + # draw plot + plotter = RPlotter(options.outputFileName, options.verbosity, options.keep) + 
plotter.setXLabel("Distance") + plotter.setYLabel("WigValue") + for strand in strands: + plotter.addLine(values[strand]) + if options.log: + plotter.setLog("y") + plotter.plot() + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/getWigProfile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getWigProfile.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,160 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""
Compute the average WIG profile over a set of genomic features.

Every feature read from the input file is rescaled to a fixed number of
points (``nbPoints``) and may be flanked by ``distance`` extra positions on
each side.  The WIG values are then averaged, position by position, over all
the features, optionally smoothed, and plotted.
"""

import math
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.parsing.WigParser import WigParser
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.RPlotter import RPlotter


class GetWigProfile(object):
    """Average the WIG signal over a set of transcripts and plot it.

    Besides ``verbosity``, the caller must set the following attributes
    before calling readTranscripts(): strands, inputFileName, inputFormat,
    wig, nbPoints, distance, smoothenForce, defaultValue, outputFileName
    and log (the command-line section of this file shows how).
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        # strand -> {position index -> averaged value}
        self.values = {}
        self.defaultValue = 0.0

    def _iToJ(self, i, size):
        """Convert index i of an extended value list into a bin index.

        i is an index in a list made of ``distance`` flanking values, ``size``
        body values and ``distance`` flanking values; the result is the index
        of the body bin (out of ``nbPoints``) that position i falls into.
        """
        return min(self.nbPoints+1, int(math.floor(float(i - self.distance) / (size) * (self.nbPoints))))

    def _rescale(self, strandValues, transcriptSize):
        """Return the 5' flank, the body averaged into nbPoints bins, and the 3' flank."""
        bodyEnd = len(strandValues) - self.distance
        fivePValues = strandValues[:self.distance]
        # Do not write "[-self.distance:]": when distance is 0 that slice is
        # "[0:]", i.e. the whole list instead of an empty flank.
        threePValues = strandValues[bodyEnd:]
        nbValues = [0.0] * (self.nbPoints)
        transcriptValues = [0.0] * (self.nbPoints)
        for i in range(self.distance, bodyEnd):
            startJ = self._iToJ(i, transcriptSize)
            # a position always feeds at least one bin, even when the body is
            # longer than nbPoints
            endJ = max(startJ+1, self._iToJ(i+1, transcriptSize))
            for j in range(startJ, endJ):
                transcriptValues[j] += strandValues[i]
                nbValues[j] += 1
        bodyValues = [self.defaultValue if nbValue == 0 else transcriptValue / nbValue for transcriptValue, nbValue in zip(transcriptValues, nbValues)]
        return fivePValues + bodyValues + threePValues

    def readTranscripts(self):
        """Read every transcript, accumulate its rescaled WIG values, then average them.

        Raises an Exception when the number of values extracted for a
        transcript is larger than the expected size.
        """
        self.strandNames = (1, -1) if self.strands else (1, )
        nbPositions = self.nbPoints + 2 * self.distance
        self.values = dict([(strand, dict([(i, 0.0) for i in range(nbPositions)])) for strand in self.strandNames])
        transcriptParser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity)
        wigParser = WigParser(self.wig)
        wigParser.setStrands(self.strands)
        wigParser.setDefaultValue(self.defaultValue)

        progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (self.inputFileName), self.verbosity)
        for transcript in transcriptParser.getIterator():
            # the size must be read before the transcript is extended
            transcriptSize = transcript.getSize()
            expectedSize = transcriptSize + 2 * self.distance
            transcript.extendStart(self.distance)
            transcript.extendEnd(self.distance)
            theseValues = transcript.extractWigData(wigParser)

            if len(self.strandNames) == 1:
                theseValues = {1: theseValues}
            for strand in self.strandNames:
                strandValues = theseValues[strand]
                if len(strandValues) < expectedSize:
                    # pad on the left with the default value
                    strandValues = [self.defaultValue] * (expectedSize - len(strandValues)) + strandValues
                if len(strandValues) != expectedSize:
                    raise Exception("Got something wrong with the size of the WIG data concerning %s [%s]: %d found instead of %d" % (transcript, ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in transcript.getExons()]), len(strandValues), expectedSize))
                for i, value in enumerate(self._rescale(strandValues, transcriptSize)):
                    self.values[strand][i] += value
            progress.inc()
        progress.done()

        nbTranscripts = transcriptParser.getNbTranscripts()
        if nbTranscripts == 0:
            # nothing was accumulated: keep the zero profile instead of dividing by zero
            return
        for strand in self.strandNames:
            for i in range(nbPositions):
                # dividing by the strand (1 or -1) flips the sign of the minus strand profile
                self.values[strand][i] /= nbTranscripts * strand

    def smoothen(self):
        """Replace each body value by the mean of its neighbours.

        The window spans ``smoothenForce - 1`` positions on each side of the
        point and is clipped to the body of the profile (flanks are neither
        used nor modified).  Nothing is done when ``smoothenForce`` is None.
        """
        if self.smoothenForce is None:
            return
        bodyStart = self.distance
        bodyEnd = self.distance + self.nbPoints
        for strand in self.strandNames:
            averageValues = {}
            for center in range(bodyStart, bodyEnd):
                window = [self.values[strand][i] for i in range(center - self.smoothenForce + 1, center + self.smoothenForce) if bodyStart <= i < bodyEnd]
                if window:
                    averageValues[center] = sum(window, 0.0) / len(window)
                else:
                    # non-positive force: empty window, leave the value untouched
                    averageValues[center] = self.values[strand][center]
            for position in range(bodyStart, bodyEnd):
                self.values[strand][position] = averageValues[position]

    def plot(self):
        """Draw one line per strand with RPlotter, labelling the flanks and the body limits."""
        plotter = RPlotter(self.outputFileName, self.verbosity)
        for strand in self.strandNames:
            plotter.addLine(self.values[strand])
        if self.log:
            plotter.setLog("y")
        plotter.setAxisLabel("x", {0: -self.distance, self.distance: "start", self.distance+self.nbPoints-1: "end", 2*self.distance+self.nbPoints-1: self.distance})
        plotter.plot()


if __name__ == "__main__":

    # parse command line
    description = "Get WIG Profile v1.0.1: Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-w", "--wig", dest="wig", action="store", type="string", help="wig file name [compulsory] [format: file in WIG format]")
    parser.add_option("-p", "--nbPoints", dest="nbPoints", action="store", default=1000, type="int", help="number of points on the x-axis [compulsory] [format: int] [default: 1000]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="distance around genomic coordinates [compulsory] [format: int] [default: 0]")
    parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider both strands separately [format: boolean] [default: False]")
    parser.add_option("-m", "--smoothen", dest="smoothen", action="store", default=None, type="int", help="smoothen the curve [format: int] [default: None]")
    parser.add_option("-a", "--default", dest="defaultValue", action="store", default=0.0, type="float", help="default value (when value is NA) [default: 0.0] [format: float]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="use log scale for y-axis [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    wigProfile = GetWigProfile(options.verbosity)
    wigProfile.strands = options.strands
    wigProfile.inputFileName = options.inputFileName
    wigProfile.inputFormat = options.inputFormat
    wigProfile.wig = options.wig
    wigProfile.nbPoints = options.nbPoints
    wigProfile.distance = options.distance
    wigProfile.smoothenForce = options.smoothen
    wigProfile.defaultValue = options.defaultValue
    wigProfile.outputFileName = options.outputFileName
    wigProfile.log = options.log

    wigProfile.readTranscripts()
    wigProfile.smoothen()
    wigProfile.plot()

# ---- patch file boundary (original changeset header, kept as comments) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/mapperAnalyzer.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/mapperAnalyzer.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,486 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Read a mapping file (many formats supported), filter the mappings and write
the selected ones to a GFF3 file.

Mappings must be sorted by read name.
"""
import os
import random
import shelve
from optparse import OptionParser, OptionGroup
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.BedWriter import BedWriter
from commons.core.writer.UcscWriter import UcscWriter
from commons.core.writer.GbWriter import GbWriter
from commons.core.writer.Gff2Writer import Gff2Writer
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


# value handed to mapping.mergeExons() when exon merging is requested
distanceExons = 20
# smallest accepted exon size when short exons are rejected
exonSize = 20


def _percentage(part, total):
    """Return part / total as a percentage, or 0.0 when total is zero."""
    if not total:
        return 0.0
    return float(part) / total * 100


class MapperAnalyzer(object):
    """
    Analyse the output of a mapping tool: filter the mappings of each read
    (size, identity, mismatches, gaps, exon size, number of hits), rank the
    remaining ones and write them to a GFF3 file.
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.mySqlConnection = MySqlConnection(verbosity)
        # counters of rejected mappings / sequences
        self.tooShort = 0
        self.tooManyMismatches = 0
        self.tooManyGaps = 0
        self.tooShortExons = 0
        self.tooManyMappings = 0
        # general counters
        self.nbMappings = 0
        self.nbSequences = 0
        self.nbAlreadyMapped = 0
        self.nbAlreadyMappedSequences = 0
        self.nbWrittenMappings = 0
        self.nbWrittenSequences = 0
        self.parser = None
        self.logHandle = None
        # used to build the names of the temporary files and table
        self.randomNumber = random.randint(0, 100000)
        self.gff3Writer = None
        self.alreadyMappedReader = None
        self.unmatchedWriter = None
        self.sequenceListParser = None
        self.sequences = None
        self.alreadyMapped = None
        self.mappedNamesTable = None
        # filters (None means "do not filter")
        self.minSize = None
        self.minId = None
        self.maxMismatches = None
        self.maxGaps = None
        self.maxMappings = None
        self.merge = False
        self.checkExons = False
        # None: unknown yet; True: sequence names carry a "/1" suffix; False: they do not
        self.suffix = None
        self.tmpDirectory = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else ""


    def __del__(self):
        # getattr: __init__ may have failed before these attributes were created
        sequences = getattr(self, "sequences", None)
        if sequences is not None:
            sequences.close()
        alreadyMapped = getattr(self, "alreadyMapped", None)
        if alreadyMapped is not None:
            alreadyMapped.close()
        mappedNamesTable = getattr(self, "mappedNamesTable", None)
        if mappedNamesTable is not None:
            mappedNamesTable.remove()
        gff3Writer = getattr(self, "gff3Writer", None)
        if gff3Writer is not None:
            gff3Writer.close()

        logHandle = getattr(self, "logHandle", None)
        if logHandle is not None:
            logHandle.close()


    def setMappingFile(self, fileName, format):
        """Select the mapping parser matching the given format."""
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "mapping")
        self.parser = parserChooser.getParser(fileName)


    def setSequenceFile(self, fileName, format):
        """Set the file of the reads; format is "fasta" or "fastq", anything else raises an Exception."""
        if format == "fasta":
            self.sequenceListParser = FastaParser(fileName, self.verbosity)
        elif format == "fastq":
            self.sequenceListParser = FastqParser(fileName, self.verbosity)
        else:
            raise Exception("Do not understand sequence format %s" % (format))


    def setOutputFile(self, fileName, title):
        """Create the GFF3 writer and give it a title."""
        self.gff3Writer = Gff3Writer(fileName, self.verbosity)
        self.gff3Writer.setTitle(title)


    def setAlreadyMatched(self, fileName):
        """Set a GFF file of reads which are already mapped (they will be skipped)."""
        self.alreadyMappedReader = GffParser(fileName, self.verbosity)


    def setRemainingFile(self, fileName, format):
        """Request the output of the unmatched reads, in "fasta" or "fastq" format.

        Also creates the temporary table which stores the names of the mapped
        reads.  Any other format raises an Exception.
        """
        if format == "fasta":
            self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity)
        elif format == "fastq":
            self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity)
        else:
            raise Exception("Do not understand %s format." % (format))
        self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity)
        self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50})
        self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True)


    def setLog(self, fileName):
        """Open the log file (closed when the analyzer is deleted)."""
        self.logHandle = open(fileName, "w")


    def setMinSize(self, size):
        """Set the minimum mapped size, as a percentage of the read size."""
        self.minSize = size


    def setMinId(self, id):
        """Set the minimum value of the "identity" tag."""
        self.minId = id


    def setMaxMismatches(self, mismatches):
        """Set the maximum value of the "nbMismatches" tag."""
        self.maxMismatches = mismatches


    def setMaxGaps(self, gaps):
        """Set the maximum value of the "nbGaps" tag."""
        self.maxGaps = gaps


    def setMaxMappings(self, mappings):
        """Set the maximum number of occurrences of a read."""
        self.maxMappings = mappings


    def mergeExons(self, b):
        """Tell whether exons should be merged (see distanceExons)."""
        self.merge = b


    def acceptShortExons(self, b):
        """Tell whether mappings with short exons are accepted (see exonSize)."""
        self.checkExons = not b


    def countMappings(self):
        """Ask the parser for the number of mappings, and print it when verbose."""
        self.nbMappings = self.parser.getNbMappings()
        if self.verbosity > 0:
            print("%i matches found" % (self.nbMappings))


    def storeAlreadyMapped(self):
        """Store in a temporary shelve the names of the reads which are already mapped."""
        self.alreadyMapped = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber))
        progress = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity)
        self.nbAlreadyMappedSequences = 0
        for transcript in self.alreadyMappedReader.getIterator():
            if transcript.getName() not in self.alreadyMapped:
                self.alreadyMapped[transcript.getName()] = 1
                self.nbAlreadyMappedSequences += 1
            progress.inc()
        progress.done()
        self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts()


    def storeSequences(self):
        """Store in a temporary shelve the size of each read, keyed by the first word of its name."""
        self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber))
        progress = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence())
            self.nbSequences += 1
            progress.inc()
        progress.done()
        if self.verbosity > 0:
            print("%i sequences read" % (self.nbSequences))


    def checkOrder(self):
        """Check that all the mappings of a read are consecutive in the mapping file.

        Raises an Exception as soon as a read name shows up in two separate
        runs of mappings.
        """
        names = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber))
        try:
            previousName = None
            progress = Progress(self.nbMappings, "Checking mapping file", self.verbosity)
            for mapping in self.parser.getIterator():
                name = mapping.queryInterval.getName()
                if name != previousName and previousName is not None:
                    if previousName in names:
                        raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (previousName))
                    names[previousName] = 1
                previousName = name
                progress.inc()
            # the last run of mappings is closed by the end of the file, not by
            # a change of name: it has to be checked too
            if previousName is not None and previousName in names:
                raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (previousName))
            progress.done()
        finally:
            names.close()


    def checkPreviouslyMapped(self, name):
        """Tell whether the read belongs to the already mapped reads (False when no such file was given)."""
        if self.alreadyMappedReader is None:
            return False
        return name in self.alreadyMapped


    def findOriginalSize(self, name):
        """Return the size of the read with the given name.

        The read file may name the reads "name" or "name/1".  The first
        successful look-up decides which form is used, and later look-ups only
        try that form.  Raises an Exception when the read cannot be found.
        """
        alternate = "%s/1" % (name)
        if not self.suffix:
            # form still unknown (None), or known to be the plain name (False)
            if name in self.sequences:
                self.suffix = False
                return self.sequences[name]
            if self.suffix is not None:
                raise Exception("Cannot find name %s" % (name))
        if alternate in self.sequences:
            # only commit to the suffixed form once it has actually been seen
            self.suffix = True
            return self.sequences[alternate]
        raise Exception("Cannot find name %s" % (name))


    def checkErrors(self, mapping):
        """Apply every per-mapping filter; return True if the mapping passes all of them.

        All the filters are evaluated (no short-circuit), so that each counter
        and the log reflect every criterion the mapping fails.
        """
        accepted = True
        # short size
        if self.minSize is not None and mapping.size * 100 < self.minSize * mapping.queryInterval.size:
            self.tooShort += 1
            accepted = False
            if self.logHandle is not None:
                self.logHandle.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.queryInterval.size, mapping.size))
        # low identity (counted together with the mismatches)
        if self.minId is not None and mapping.getTagValue("identity") < self.minId:
            self.tooManyMismatches += 1
            accepted = False
            if self.logHandle is not None:
                self.logHandle.write("mapping %s has a low identity rate\n" % (str(mapping)))
        # too many mismatches
        if self.maxMismatches is not None and mapping.getTagValue("nbMismatches") > self.maxMismatches:
            self.tooManyMismatches += 1
            accepted = False
            if self.logHandle is not None:
                self.logHandle.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches))
        # too many gaps
        if self.maxGaps is not None and mapping.getTagValue("nbGaps") > self.maxGaps:
            self.tooManyGaps += 1
            accepted = False
            if self.logHandle is not None:
                self.logHandle.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps))
        # short exons
        if self.checkExons and len(mapping.subMappings) > 1 and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize:
            self.tooShortExons += 1
            accepted = False
            if self.logHandle is not None:
                self.logHandle.write("sequence %s maps as too short exons\n" % (mapping))
        return accepted


    def _countOccurrences(self, mappings):
        """Sum the "nbOccurrences" tags of the mappings (a mapping without the tag counts for 1)."""
        nbOccurrences = 0
        for mapping in mappings:
            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")
        return nbOccurrences


    def checkNbMappings(self, mappings):
        """Return True if the read has at least one occurrence and no more than maxMappings."""
        nbOccurrences = self._countOccurrences(mappings)
        if (self.maxMappings is not None and nbOccurrences > self.maxMappings):
            self.tooManyMappings += 1
            if self.logHandle is not None:
                self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences))
            return False
        return (nbOccurrences > 0)


    def sortMappings(self, mappings):
        """Return the mappings sorted by error score, tagged with occurrence, rank and best region.

        mappings must not be empty.  Mappings with the same score as the
        previous one get a "<rank>Tie" rank.
        """
        nbOccurrences = self._countOccurrences(mappings)

        orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore())
        cpt = 1
        rank = 1
        previousMapping = None
        previousScore = None
        wasLastTie = False
        rankedMappings = []
        bestRegion = "%s:%d-%d" % (orderedMappings[0].targetInterval.getChromosome(), orderedMappings[0].targetInterval.getStart(), orderedMappings[0].targetInterval.getEnd())
        for mapping in orderedMappings:
            mapping.setNbOccurrences(nbOccurrences)
            mapping.setOccurrence(cpt)

            score = mapping.getErrorScore()
            if previousScore is not None and previousScore == score:
                if "Rank" in previousMapping.getTagNames():
                    if not wasLastTie:
                        # first tie of the run: the previous mapping is tied too
                        previousMapping.setRank("%sTie" % (rank))
                    mapping.setRank("%sTie" % (rank))
                    wasLastTie = True
            else:
                rank = cpt
                mapping.setRank(rank)
                wasLastTie = False
            if cpt != 1:
                mapping.setBestRegion(bestRegion)

            rankedMappings.append(mapping)
            previousMapping = mapping
            previousScore = score
            cpt += 1
        return rankedMappings


    def processMappings(self, mappings):
        """Filter, rank and write all the mappings of one read."""
        if not mappings:
            return
        selectedMappings = []
        name = mappings[0].queryInterval.getName()
        size = self.findOriginalSize(name)
        for mapping in mappings:
            if self.merge:
                mapping.mergeExons(distanceExons)
            mapping.queryInterval.size = size
            if self.checkErrors(mapping):
                selectedMappings.append(mapping)

        if self.checkNbMappings(selectedMappings):
            if self.unmatchedWriter is not None:
                # NOTE(review): the read name is interpolated verbatim in the
                # SQL text; a name containing a quote breaks (or alters) the
                # query.  Use a parameterised query if MySqlConnection offers one.
                self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name)))
            self.nbWrittenSequences += 1
            mappings = self.sortMappings(selectedMappings)
            for mapping in mappings:
                self.nbWrittenMappings += 1
                self.gff3Writer.addTranscript(mapping.getTranscript())


    def readMappings(self):
        """Read the mapping file, group consecutive mappings by read name and process each group."""
        previousQueryName = None
        mappings = []
        self.parser.reset()
        progress = Progress(self.nbMappings, "Reading mappings", self.verbosity)
        for mapping in self.parser.getIterator():
            queryName = mapping.queryInterval.getName().split(" ")[0]
            if self.checkPreviouslyMapped(queryName):
                if self.logHandle is not None:
                    self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName))
            else:
                if previousQueryName == queryName:
                    mappings.append(mapping)
                else:
                    if previousQueryName is not None:
                        self.processMappings(mappings)
                    previousQueryName = queryName
                    mappings = [mapping, ]
            progress.inc()
        # last group
        self.processMappings(mappings)
        self.gff3Writer.write()
        self.gff3Writer.close()
        progress.done()


    def writeUnmatched(self):
        """Write the reads whose name is not stored in the table of mapped names."""
        progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            name = sequence.getName().split(" ")[0]
            # NOTE(review): same string-built SQL as in processMappings
            query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name))
            if query.isEmpty():
                self.unmatchedWriter.addSequence(sequence)
            progress.inc()
        progress.done()


    def analyze(self):
        """Run the whole analysis: count, check order, load reads, filter and write."""
        self.countMappings()
        self.checkOrder()
        self.storeSequences()
        if self.alreadyMappedReader is not None:
            self.storeAlreadyMapped()
        self.readMappings()
        if self.unmatchedWriter is not None:
            self.writeUnmatched()




if __name__ == "__main__":

    # parse command line
    description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]"

    parser = OptionParser(description = description)
    compGroup = OptionGroup(parser, "Compulsory options")
    filtGroup = OptionGroup(parser, "Filtering options")
    tranGroup = OptionGroup(parser, "Transformation options")
    outpGroup = OptionGroup(parser, "Output options")
    otheGroup = OptionGroup(parser, "Other options")
    compGroup.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file (output of the tool) [compulsory] [format: file in mapping format given by -f]")
    compGroup.add_option("-f", "--format", dest="format", action="store", default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]")
    compGroup.add_option("-q", "--sequences", dest="sequencesFileName", action="store", type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]")
    compGroup.add_option("-k", "--seqFormat", dest="sequenceFormat", action="store", default="fasta", type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]")
    compGroup.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    filtGroup.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="max. number of occurrences of a sequence [format: int]")
    filtGroup.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="minimum pourcentage of size [format: int]")
    filtGroup.add_option("-d", "--identity", dest="identity", action="store", default=None, type="int", help="minimum pourcentage of identity [format: int]")
    filtGroup.add_option("-m", "--mismatch", dest="mismatch", action="store", default=None, type="int", help="maximum number of mismatches [format: int]")
    filtGroup.add_option("-p", "--gap", dest="gap", action="store", default=None, type="int", help="maximum number of gaps [format: int]")
    tranGroup.add_option("-e", "--mergeExons", dest="mergeExons", action="store_true", default=False, help="merge exons when introns are short [format: bool] [default: false]")
    tranGroup.add_option("-x", "--removeExons", dest="removeExons", action="store_true", default=False, help="remove transcripts when exons are short [format: bool] [default: false]")
    outpGroup.add_option("-t", "--title", dest="title", action="store", default="SMART", type="string", help="title of the UCSC track [format: string] [default: SMART]")
    outpGroup.add_option("-r", "--remaining", dest="remaining", action="store_true", default=False, help="print the unmatched sequences [format: bool] [default: false]")
    otheGroup.add_option("-a", "--append", dest="appendFileName", action="store", default=None, type="string", help="append to GFF3 file [format: file in GFF3 format]")
    otheGroup.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    otheGroup.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    parser.add_option_group(compGroup)
    parser.add_option_group(filtGroup)
    parser.add_option_group(tranGroup)
    parser.add_option_group(outpGroup)
    parser.add_option_group(otheGroup)
    (options, args) = parser.parse_args()


    analyzer = MapperAnalyzer(options.verbosity)
    analyzer.setMappingFile(options.inputFileName, options.format)
    analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat)
    analyzer.setOutputFile(options.outputFileName, options.title)
    if options.appendFileName is not None:
        analyzer.setAlreadyMatched(options.appendFileName)
    if options.remaining:
        analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat)
    if options.number is not None:
        analyzer.setMaxMappings(options.number)
    if options.size is not None:
        analyzer.setMinSize(options.size)
    if options.identity is not None:
        analyzer.setMinId(options.identity)
    if options.mismatch is not None:
        analyzer.setMaxMismatches(options.mismatch)
    if options.gap is not None:
        analyzer.setMaxGaps(options.gap)
    if options.mergeExons:
        analyzer.mergeExons(True)
    if options.removeExons:
        analyzer.acceptShortExons(False)
    if options.log:
        analyzer.setLog("%s.log" % (options.outputFileName))
    analyzer.analyze()

    if options.verbosity > 0:
        # _percentage() keeps the report usable when the input is empty
        nbSequences = analyzer.nbSequences
        nbMappings = analyzer.nbMappings
        appended = options.appendFileName is not None
        print("kept %i sequences over %s (%f%%)" % (analyzer.nbWrittenSequences, nbSequences, _percentage(analyzer.nbWrittenSequences, nbSequences)))
        if appended:
            print("kept %i sequences over %s (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, nbSequences, _percentage(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, nbSequences)))
        print("kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, nbMappings, _percentage(analyzer.nbWrittenMappings, nbMappings)))
        if appended:
            print("kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, nbMappings, _percentage(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, nbMappings)))
        print("removed %i too short mappings (%f%%)" % (analyzer.tooShort, _percentage(analyzer.tooShort, nbMappings)))
        print("removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, _percentage(analyzer.tooManyMismatches, nbMappings)))
        print("removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, _percentage(analyzer.tooManyGaps, nbMappings)))
        print("removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, _percentage(analyzer.tooShortExons, nbMappings)))
        print("removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, _percentage(analyzer.tooManyMappings, nbSequences)))
        print("%i sequences have no mapping (%f%%)" % (nbSequences - analyzer.nbWrittenSequences, _percentage(nbSequences - analyzer.nbWrittenSequences, nbSequences)))
        if appended:
            print("%i sequences have no mapping (%f%%) excluding already mapped sequences" % (nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, _percentage(nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, nbSequences)))


# ---- patch file boundary (original changeset header, kept as comments) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/mappingToCoordinates.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/mappingToCoordinates.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,91 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class MappingToCoordinates(object):
    """
    Convert a file in some mapping format to a GFF3 file of transcripts

    The input is read through a TranscriptContainer and every element is
    handed to a Gff3Writer.
    """

    def __init__(self, verbosity=1, inputFileName=None, format=None, output=None, galaxy=False, title="S-MART"):
        """
        Constructor
        @param verbosity:     trace level
        @param inputFileName: name of the mapping file
        @param format:        format of the mapping file
        @param output:        name of the output file
        @param galaxy:        whether the tool is run from Galaxy
        @param title:         title given to the GFF3 writer
        """
        self.verbosity     = verbosity
        self.inputFileName = inputFileName
        self.format        = format
        self.output        = output
        self.galaxy        = galaxy
        self.title         = title

    def setAttributesFromCmdLine(self):
        """
        Read the command line and copy the options into the attributes
        """
        description = "Mapping To Coordinates v1.0.1: Convert a set of mappings (given by a mapping tool) to a set of transcripts. [Category: Conversion]"
        optionParser = OptionParser(description = description)
        optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in mapping format given by -f]")
        optionParser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: mapping file format]")
        optionParser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
        optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
        optionParser.add_option("-G", "--galaxy", dest="galaxy", action="store_true", default=False, help="used for galaxy [format: bool] [default: False]")
        options = optionParser.parse_args()[0]

        # every option is stored under the attribute of the same name
        for attributeName in ("verbosity", "inputFileName", "format", "output", "galaxy"):
            setattr(self, attributeName, getattr(options, attributeName))

    def run(self):
        """
        Read the mappings and write them as transcripts
        """
        verbose = self.verbosity > 0
        if verbose:
            print("Reading input file...")
        container = TranscriptContainer(self.inputFileName, self.format, self.verbosity)
        if verbose:
            print("... done")
        gffWriter = Gff3Writer(self.output, self.verbosity, self.title)

        tracker = Progress(container.getNbTranscripts(), "Reading %s" % (self.inputFileName), self.verbosity)
        for mapping in container.getIterator():
            gffWriter.addTranscript(mapping)
            tracker.inc()
        tracker.done()

        if not self.galaxy:
            return
        # in Galaxy mode, the file "<output>.gff3" is moved to the requested output name
        os.rename("%s.gff3" % (self.output), self.output)


if __name__ == '__main__':
    launcher = MappingToCoordinates()
    launcher.setAttributesFromCmdLine()
    launcher.run()
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class MergeSlidingWindowsClusters(object):
    """
    Merge the output of several sets of sliding windows

    Regions are indexed by chromosome, strand, start and end.  When the same
    region is found in several inputs, the tags of all occurrences are merged
    into one output region.  The inputs are processed chromosome by chromosome.
    @ivar verbosity:     verbosity
    @ivar inputs:        one transcript container per input file
    @ivar outputData:    chromosome -> strand -> start -> end -> tags
    @ivar nbData:        number of distinct regions stored so far
    @ivar nbWrittenData: number of regions already given to the writer
    @ivar chromosomes:   chromosomes found in the inputs
    @ivar writer:        GFF3 writer, set by setOutput
    """

    def __init__(self, verbosity = 0):
        self.verbosity     = verbosity
        self.inputs        = []
        self.outputData    = {}
        self.nbData        = 0
        self.nbWrittenData = 0
        self.chromosomes   = []
        self.writer        = None

    def __del__(self):
        if self.writer is not None:
            self.writer.close()

    def addInput(self, fileName, fileFormat):
        """
        Add an input file and record the chromosomes it contains
        @param fileName:   name of the file
        @param fileFormat: format of the file
        """
        container = TranscriptContainer(fileName, fileFormat, self.verbosity)
        self.inputs.append(container)
        self.chromosomes = list(set(self.chromosomes).union(container.getChromosomes()))

    def setOutput(self, fileName):
        """
        Set the output file (GFF3 format)
        @param fileName: name of the file
        """
        self.writer = Gff3Writer(fileName, self.verbosity)

    def _storeRegion(self, chromosome, direction, start, end, tags):
        """
        Store one region, merging its tags with a region already stored
        All the regions which start at the same position on the same strand
        must have the same end: exit with an error message otherwise.
        (The previous check was only performed when the end was already
        stored, so that it could never detect a conflicting end.)
        @param chromosome: chromosome of the region
        @param direction:  strand of the region
        @param start:      start of the region
        @param end:        end of the region
        @param tags:       tags of the region (dict)
        """
        regions = self.outputData.setdefault(chromosome, {}).setdefault(direction, {}).setdefault(start, {})
        if end in regions:
            regions[end].update(tags)
            return
        if regions:
            otherEnd = next(iter(regions))
            sys.exit("Error! Two regions starting at %d end are not consistent (%d and %d) in %s on strand %d" % (start, end, otherEnd, chromosome, direction))
        regions[end] = tags
        self.nbData += 1

    def readInput(self, i, chromosome):
        """
        Store the regions of an input file which belong to a chromosome
        @param i:          index of the input file
        @param chromosome: chromosome to read
        """
        progress = Progress(self.inputs[i].getNbTranscripts(), "Reading file #%d -- chromosome %s" % (i+1, chromosome), self.verbosity)
        for transcript in self.inputs[i].getIterator():
            progress.inc()
            if chromosome != transcript.getChromosome():
                continue
            self._storeRegion(chromosome, transcript.getDirection(), transcript.getStart(), transcript.getEnd(), transcript.tags)
        progress.done()

    def writeOutput(self, chromosome):
        """
        Write the regions stored for a chromosome, then forget them
        @param chromosome: chromosome to write
        """
        progress = Progress(self.nbData - self.nbWrittenData, "Writing output for chromosome %s" % (chromosome), self.verbosity)
        for direction, starts in self.outputData.get(chromosome, {}).items():
            for start, ends in starts.items():
                for end, tags in ends.items():
                    transcript = Transcript()
                    transcript.setChromosome(chromosome)
                    transcript.setStart(start)
                    transcript.setEnd(end)
                    transcript.setDirection(direction)
                    transcript.tags = tags
                    transcript.setName("region_%d" % (self.nbWrittenData + 1))
                    # work on a copy of the names: tags are removed while looping
                    for tag in list(transcript.getTagNames()):
                        if tag.startswith(("Name_", "ID_")):
                            del transcript.tags[tag]
                    self.nbWrittenData += 1
                    self.writer.addTranscript(transcript)
                    progress.inc()
        self.writer.write()
        progress.done()
        # free the memory before the next chromosome is read
        self.outputData = {}

    def merge(self):
        """
        Merge all the inputs, one chromosome at a time, and close the output
        """
        for chromosome in self.chromosomes:
            for i, _ in enumerate(self.inputs):
                self.readInput(i, chromosome)
            self.writeOutput(chromosome)
        self.writer.close()


if __name__ == "__main__":

    # parse command line
    description = "Merge Sliding Windows Clusters v1.0.2: Merge two files containing the results of a sliding windows clustering. [Category: Sliding Windows]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of the input file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of the input file 2 [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    merger = MergeSlidingWindowsClusters(options.verbosity)
    merger.addInput(options.inputFileName1, options.inputFormat1)
    merger.addInput(options.inputFileName2, options.inputFormat2)
    merger.setOutput(options.outputFileName)
    merger.merge()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class MergeLists(object):
    """
    Merge the elements of two lists of genomic coordinates

    The work is done by chaining comparisons whose intermediate results are
    stored in temporary GFF3 files named "tmp_<root>_<seed>.gff3".
    """

    def __init__(self, verbosity):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity      = verbosity
        self.seed           = random.randint(0, 100000)
        self.aggregation    = False
        self.normalization  = False
        self.distance       = False
        self.antisense      = False
        self.colinear       = False
        self.fileNames      = {}
        self.formats        = {}
        self.tmpFileNames   = []
        self.logHandle      = None
        # defined here so that the attribute always exists; set by setOutputFileName
        self.outputFileName = None

    # NOTE: the destructor which removed the temporary files and closed the
    # log handle is disabled in the original source: the files listed in
    # tmpFileNames are left on disk, and the log handle is never closed.

    def setLogFileName(self, fileName):
        """
        Open a log file, which is given to the comparators
        @param fileName: name of the log file
        """
        self.logHandle = open(fileName, "w")

    def setInputFileName(self, fileName, format, id):
        """
        Set an input file
        @param fileName: name of the file
        @param format:   format of the file
        @param id:       0 for the first file, 1 for the second one
        """
        self.fileNames[id] = fileName
        self.formats[id]   = format

    def setOutputFileName(self, fileName):
        """
        Set the output file (GFF3 format)
        @param fileName: name of the file
        """
        self.outputFileName = fileName

    def setAggregate(self, aggregation):
        """
        Set the aggregation flag (option "--all" of the command line)
        """
        self.aggregation = aggregation

    def setNormalization(self, normalization):
        """
        Set the normalization flag, given to the comparators
        """
        self.normalization = normalization

    def setDistance(self, distance):
        """
        Set the maximum distance between two transcripts
        """
        self.distance = distance

    def setAntisense(self, antisense):
        """
        Set the "antisense only" flag
        """
        self.antisense = antisense

    def setColinear(self, colinear):
        """
        Set the "colinear only" flag
        """
        self.colinear = colinear

    def createTmpFileName(self, root):
        """
        Build the name of a temporary GFF3 file, and remember it
        @param root: short name which identifies the file
        @return:     the name of the file
        """
        fileName = "tmp_%s_%d.gff3" % (root, self.seed)
        self.tmpFileNames.append(fileName)
        return fileName

    def selfMerge(self, fileName, format, outputFileName):
        """
        Merge the colinear elements of one file with each other
        @param fileName:       name of the input file
        @param format:         format of the input file
        @param outputFileName: name of the GFF3 file to write
        """
        transcriptListComparator = TranscriptListsComparator(self.logHandle, self.verbosity)
        transcriptListComparator.getColinearOnly(True)
        transcriptListComparator.setNormalization(self.normalization)
        transcriptContainer = TranscriptContainer(fileName, format, self.verbosity)
        writer = TranscriptWriter(outputFileName, "gff3", self.verbosity)
        transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, transcriptContainer)
        transcriptListComparator.setOutputWriter(writer)
        transcriptListComparator.compareTranscriptListSelfMerge()

    def keepOverlapping(self, fileNames, formats, outputFileName):
        """
        Compare two files, using the antisense, colinear, aggregation,
        normalization and distance settings
        @param fileNames:      names of the two files, indexed by 0 and 1
        @param formats:        formats of the two files, indexed by 0 and 1
        @param outputFileName: name of the GFF3 file to write
        """
        transcriptListComparator = TranscriptListsComparator(self.logHandle, self.verbosity)
        transcriptListComparator.getAntisenseOnly(self.antisense)
        transcriptListComparator.getColinearOnly(self.colinear)
        for i in (0, 1):
            transcriptContainer = TranscriptContainer(fileNames[i], formats[i], self.verbosity)
            transcriptListComparator.setInputTranscriptContainer(i, transcriptContainer)
        transcriptListComparator.aggregate(self.aggregation)
        transcriptListComparator.setNormalization(self.normalization)
        transcriptListComparator.setMaxDistance(self.distance)
        writer = TranscriptWriter(outputFileName, "gff3", self.verbosity)
        transcriptListComparator.setOutputWriter(writer)
        transcriptListComparator.compareTranscriptList()

    def mergeFiles(self, fileName1, fileName2, outputFileName):
        """
        Concatenate two files
        @param fileName1:      name of the first file
        @param fileName2:      name of the second file
        @param outputFileName: name of the file to write
        """
        # every handle is closed, even if a copy fails
        with open(outputFileName, "w") as outputFile:
            for fileName in (fileName1, fileName2):
                with open(fileName, "r") as inputFile:
                    shutil.copyfileobj(inputFile, outputFile)

    def run(self):
        """
        Merge the two input files into the output file
        """
        selectedFileQuery = self.createTmpFileName("query")
        # the query is a user file: it is read with the format given by the user
        # (the temporary files, written by this class, are always in GFF3 format)
        self.keepOverlapping({0: self.fileNames[0], 1: self.fileNames[0]}, {0: self.formats[0], 1: self.formats[0]}, selectedFileQuery)
        mergeFileTarget = self.createTmpFileName("target")
        self.selfMerge(self.fileNames[1], self.formats[1], mergeFileTarget)
        if not self.aggregation:
            overlapFile = self.createTmpFileName("overlap")
            self.keepOverlapping({0: mergeFileTarget, 1: selectedFileQuery}, {0: "gff3", 1: "gff3"}, overlapFile)
            mergeFileTarget = overlapFile
        mergeFileMerged = self.createTmpFileName("merged")
        self.mergeFiles(mergeFileTarget, selectedFileQuery, mergeFileMerged)
        self.selfMerge(mergeFileMerged, "gff3", self.outputFileName)



if __name__ == "__main__":

    # parse command line
    description = "Merge Lists v1.0.3: Merge the elements of two lists of genomic coordinates. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", default=None, type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", default=None, type="string", help="format of file 2 [compulsory] [format: file in transcript format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-k", "--all", dest="all", action="store_true", default=False, help="print all the transcripts, not only those overlapping [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts [format: int] [default: 0]")
    parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")
    parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")
    parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # NOTE(review): the verbosity given on the command line is parsed but not
    # used, the merger is always silent -- confirm that this is intended
    ml = MergeLists(0)
    ml.setInputFileName(options.inputFileName1, options.format1, 0)
    ml.setInputFileName(options.inputFileName2, options.format2, 1)
    ml.setOutputFileName(options.outputFileName)
    ml.setAntisense(options.antisense)
    ml.setColinear(options.colinear)
    ml.setAggregate(options.all)
    ml.setNormalization(options.normalize)
    ml.setDistance(options.distance)
    ml.run()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
# number of colors of the palette which is made available to the plots
NBCOLORS = 9


class MultipleRPlotter(object):
    """
    Plot multiple curves with RPlotter: the plots are stacked in one picture
    @ivar fileName:       name of the file
    @type fileName:       string
    @ivar height:         height of the file
    @type height:         int
    @ivar width:          width of the file
    @type width:          int
    @ivar plots:          plots to be included
    @type plots:          list of L{RPlotter{RPlotter}}
    @ivar keep:           keep script lines
    @type keep:           boolean
    @ivar format:         format of the file
    @type format:         string
    @ivar verbosity:      verbosity
    @type verbosity:      int
    @ivar scriptFileName: name of the temporary R script (it depends on the process id)
    @type scriptFileName: string
    """

    def __init__(self, fileName, verbosity = 0, keep = False):
        """
        Constructor
        @param fileName:  name of the file to produce
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        @param keep:      keep temporary files
        @type  keep:      boolean
        """
        self.fileName       = fileName
        self.verbosity      = verbosity
        self.keep           = keep
        self.format         = "png"
        self.width          = 1000
        self.height         = 500
        self.plots          = []
        self.scriptFileName = "tmpScript-%d.R" % (os.getpid())

    def __del__(self):
        """
        Destructor
        Remove script files
        """
        if not self.keep:
            if os.path.exists(self.scriptFileName):
                os.remove(self.scriptFileName)
            # output file produced by "R CMD BATCH" for the script
            outputFileName = "%sout" % (self.scriptFileName)
            if os.path.exists(outputFileName):
                os.remove(outputFileName)

    def setFormat(self, format):
        """
        Set the format of the picture
        @param format: the format
        @type  format: string
        """
        if format not in ("png", "pdf", "jpeg", "bmp", "tiff"):
            raise Exception("Format '%s' is not supported by RPlotter" % (format))
        self.format = format


    def setWidth(self, width):
        """
        Set the width of the image produced
        @param width: width of the image
        @type  width: int
        """
        self.width = width


    def setHeight(self, height):
        """
        Set the height of the image produced
        @param height: height of the image
        @type  height: int
        """
        self.height = height


    def setImageSize(self, width, height):
        """
        Set the dimensions of the image produced
        @param width:  width of the image
        @type  width:  int
        @param height: height of the image
        @type  height: int
        """
        self.width  = width
        self.height = height

    def addPlot(self, plot):
        """
        Add a plot
        @param plot: plot to be included
        @type  plot: L{RPlotter{RPlotter}}
        """
        self.plots.append(plot)

    def _writeScript(self):
        """
        Write the R script which draws all the plots, one per row
        """
        # the R "pdf" device names its output "file", the other ones "filename"
        deviceArgument = "file" if self.format == "pdf" else "filename"
        # "with" closes the script even if a plot fails to give its lines
        with open(self.scriptFileName, "w") as scriptHandle:
            scriptHandle.write("library(RColorBrewer)\n")
            scriptHandle.write("colorPanel = brewer.pal(n=%d, name=\"Set1\")\n" % (NBCOLORS))
            scriptHandle.write("%s(%s = \"%s\", width = %d, height = %d, bg = \"white\")\n" % (self.format, deviceArgument, self.fileName, self.width, self.height))
            scriptHandle.write("par(mfrow=c(%d, 1))\n" % (len(self.plots)))
            for plot in self.plots:
                scriptHandle.write(plot.getScript())
            scriptHandle.write("dev.off()\n")

    def plot(self):
        """
        Plot the figures
        The R executable is "R", or the value of the environment variable
        SMARTRPATH if it is set.
        An exception is raised if R fails; the script is then kept.
        """
        self._writeScript()
        rCommand = os.environ.get("SMARTRPATH", "R")
        command = "\"%s\" CMD BATCH %s" % (rCommand, self.scriptFileName)
        status = subprocess.call(command, shell=True)
        if status != 0:
            # keep the script so that the problem can be investigated
            self.keep = True
            raise Exception("Problem with the execution of script file %s, status is: %s" % (self.scriptFileName, status))
class Progress(object):
    """
    Show the progress of a process

    A progress bar is written on the standard output and refreshed in place
    (the line ends with a carriage return); nothing is written if the
    verbosity is not positive.
    """

    def __init__(self, aim, message = "Progress", verbosity = 0):
        """
        Constructor
        @param aim:       number of steps of the process
        @param message:   text written before the bar (truncated to 50 characters)
        @param verbosity: verbosity
        """
        self.aim            = aim
        self.progress       = 0
        self.message        = message
        self.length         = -1
        self.verbosity      = verbosity
        self.maxMessageSize = 50
        self.barSize        = 80
        self.startTime      = time.time()
        self.elapsed        = 0
        if len(self.message) > self.maxMessageSize:
            self.message = self.message[0:self.maxMessageSize-3] + "..."
        self.show()


    def inc(self):
        """
        Record that one more step is done, and refresh the bar
        """
        self.progress += 1
        self.show()


    def getPrintableElapsedTime(self, time):
        """
        Format a duration as hours and minutes, minutes and seconds, or seconds
        @param time: duration, in seconds
        @return:     the formatted duration
        """
        # divmod gives integers on every version of Python
        # ("/" is a true division on Python 3, and gave wrong fields)
        timeHou, remainder = divmod(int(time), 3600)
        timeMin, timeSec   = divmod(remainder, 60)
        if timeHou > 0:
            return "%3dh %2dm" % (timeHou, timeMin)
        if timeMin > 0:
            return "%2dm %2ds" % (timeMin, timeSec)
        return "%2ds " % (timeSec)


    def show(self):
        """
        Write the bar, if it is longer than the last time or if more than
        10 seconds have passed since the last refresh
        """
        if self.verbosity <= 0:
            return
        if self.aim == 0:
            return
        messageSize = len(self.message)
        length      = int(self.progress / float(self.aim) * self.barSize)
        elapsed     = int(time.time() - self.startTime)
        if (length > self.length) or (elapsed > self.elapsed + 10):
            self.length  = length
            self.elapsed = elapsed
            string       = "%s%s[%s%s] %d/%d" % (self.message, " " * max(0, self.maxMessageSize - messageSize), "=" * self.length, " " * (self.barSize - self.length), self.progress, self.aim)
            # no estimation is possible (division by zero) while nothing is done
            if elapsed > 5 and self.progress > 0:
                done       = float(self.progress) / self.aim
                total      = elapsed / done
                remaining  = total - elapsed
                string    += " ETA: %s " % (self.getPrintableElapsedTime(remaining))
            string += "\r"
            sys.stdout.write(string)
            sys.stdout.flush()


    def done(self):
        """
        Write the full bar, and the time taken by the process
        """
        if self.verbosity > 0:
            messageSize = len(self.message)
            elapsed     = time.time() - self.startTime
            print("%s%s[%s] %d completed in %s " % (self.message, " " * max(0, self.maxMessageSize - messageSize), "=" * self.barSize, self.aim, self.getPrintableElapsedTime(elapsed)))
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# + +import os +import subprocess +import random +import math + +minPositiveValue = 10e-6 + +""" +Plot simple curves in R +""" + +class RPlotter(object): + """ + Plot some curves + @ivar nbColors: number of different colors + @type nbColors: int + @ivar fileName: name of the file + @type fileName: string + @ivar lines: lines to be plotted + @type lines: array of dict + @ivar names: name of the lines + @type names: array of strings + @ivar colors: color of the lines + @type colors: array of strings + @ivar types: type of the lines (plain or dashed) + @type types: array of strings + @ivar format: format of the picture + @type format: string + @ivar lineWidth: width of the line in a xy-plot + @type lineWidth: int + @ivar xMin: minimum value taken on the x-axis + @type xMin: int + @ivar xMax: maximum value taken on the x-axis + @type xMax: int + @ivar yMin: minimum value taken on the y-axis + @type yMin: int + @ivar yMax: maximum value taken on the y-axis + @type yMax: int + @ivar minimumX: minimum value allowed on the x-axis + @type minimumX: int + @ivar maximumX: maximum value allowed on the x-axis + @type maximumX: int + @ivar minimumY: minimum value allowed on the y-axis + @type minimumY: int + @ivar maximumY: maximum value allowed on the y-axis + @type maximumY: int + @ivar leftMargin: add some margin in the left part of the plot + @type leftMargin: float + @ivar rightMargin: add some margin in the right part of the plot + @type rightMargin: float + @ivar downMargin: add some margin at the top of the plot + @type downMargin: float + @ivar upMargin: add some margin at the bottom of the plot + @type upMargin: float + @ivar logX: use log scale on the x-axis + @type logX: boolean + @ivar logY: use log scale on the y-axis + @type logY: boolean + @ivar logZ: use log scale on the z-axis (the color) + @type logZ: boolean + @ival fill: if a value is not given, fill it with given value + @type fill: int + @ival bucket: cluster the data into buckets of given size + @type 
bucket: int + @ival seed: a random number + @type seed: int + @ival regression: plot a linear regression + @type regression: boolean + @ival legend: set the legend + @type legend: boolean + @ival legendBySide: set the legend outside of the plot + @type legendBySde: boolean + @ival xLabel: label for the x-axis + @type xLabel: string + @ival yLabel: label for the y-axis + @type yLabel: string + @ival title: title of the plot + @type title: string + @ival barplot: use a barplot representation instead + @type barplot: boolean + @ival points: use a point cloud instead + @type points: boolean + @ival heatPoints: use a colored point cloud instead + @type heatPoints: boolean + @ival axesLabels: change the names of the axes + @type axesLabels: vector of 2 int to string dict + @ival rotateAxesLabels: rotate the axes labels + @type rotateAxesLabels: dict of 2 boolean + @ival verbosity: verbosity of the class + @type verbosity: int + @ival keep: keep temporary files + @type keep: boolean + """ + + def __init__(self, fileName, verbosity = 0, keep = False): + """ + Constructor + @param fileName: name of the file to produce + @type fileName: string + @param verbosity: verbosity + @type verbosity: int + @param keep: keep temporary files + @type keep: boolean + """ + self.nbColors = 9 + self.fileName = fileName + self.verbosity = verbosity + self.keep = keep + self.format = "png" + self.fill = None + self.bucket = None + self.lines = [] + self.names = [] + self.colors = [] + self.types = [] + self.lineWidth = 1 + self.xMin = None + self.xMax = None + self.yMin = None + self.yMax = None + self.seed = random.randint(0, 10000) + self.minimumX = None + self.maximumX = None + self.minimumY = None + self.maximumY = None + self.leftMargin = 0 + self.rightMargin = 0 + self.topMargin = 0 + self.bottomMargin = 0 + self.logX = False + self.logY = False + self.logZ = False + self.regression = False + self.width = 1000 + self.height = 500 + self.legend = False + self.legendBySide = False + 
self.xLabel = "" + self.yLabel = "" + self.title = None + self.points = False + self.heatPoints = False + self.barplot = False + self.axesLabels = {1: None, 2: None} + self.rotateAxesLabels = {1: False, 2: False} + self.linesToAddBox = "" + + def __del__(self): + """ + Destructor + Remove tmp files + """ + if not self.keep: + scriptFileName = "tmpScript-%d.R" % (self.seed) + if os.path.exists(scriptFileName): + os.remove(scriptFileName) + outputFileName = "%sout" % (scriptFileName) + if os.path.exists(outputFileName): + os.remove(outputFileName) + nbLines = len(self.lines) + (1 if self.heatPoints else 0) + for i in range(nbLines): + if os.path.exists("tmpData-%d-%d.dat" % (self.seed, i)): + os.remove("tmpData-%d-%d.dat" % (self.seed, i)) + + + def setMinimumX(self, xMin): + """ + Set the minimum value on the x-axis + @param xMin:minimum value on the x-axis + @type xMin: int + """ + self.minimumX = xMin + + + def setMaximumX(self, xMax): + """ + Set the maximum value on the x-axis + @param xMax: maximum value on the x-axis + @type xMax: int + """ + self.maximumX = xMax + + + def setMinimumY(self, yMin): + """ + Set the minimum value on the y-axis + @param yMin: minimum value on the y-axis + @type yMin: int + """ + self.minimumY = yMin + + + def setMaximumY(self, yMax): + """ + Set the maximum value on the y-axis + @param yMax: maximum value on the y-axis + @type xmax: int + """ + self.maximumY = yMax + + + def setFill(self, fill): + """ + Fill empty data with given value + @param fill: the value to fill with + @type fill: int + """ + self.fill = fill + + + def setBuckets(self, bucket): + """ + Cluster the data into buckets of given size + @param bucket: the size of the buckets + @type bucket: int + """ + self.bucket = bucket + + + def setRegression(self, regression): + """ + Plot a linear regression line + @param regression: whether to plot the regression + @type regression: bool + """ + self.regression = regression + + + def setFormat(self, format): + """ + Set the 
format of the picture + @param format: the format + @type format: string + """ + if format not in ("png", "pdf", "jpeg", "bmp", "tiff"): + raise Exception("Format '%s' is not supported by RPlotter" % (format)) + self.format = format + + + def setWidth(self, width): + """ + Set the dimensions of the image produced + @param width: width of the image + @type width: int + """ + self.width = width + + + def setHeight(self, height): + """ + Set the dimensions of the image produced + @param height: heigth of the image + @type height: int + """ + self.height = height + + + def setImageSize(self, width, height): + """ + Set the dimensions of the image produced + @param width: width of the image + @type width: int + @param height: heigth of the image + @type height: int + """ + self.setWidth(width) + self.setHeight(height) + + + def setLegend(self, legend, bySide = False): + """ + Print a legend or not + @param legend: print a legend + @type legend: boolean + @param bySide: put the legend outside of the plot + @type bySide: boolean + """ + self.legend = legend + self.legendBySide = bySide + + + def setXLabel(self, label): + """ + Print a label for the x-axis + @param label: the label + @type label: string + """ + self.xLabel = label + if self.xLabel != None: + self.xLabel = self.xLabel.replace("_", " ") + + + def setYLabel(self, label): + """ + Print a label for the y-axis + @param label: the label + @type label: string + """ + self.yLabel = label + if self.yLabel != None: + self.yLabel = self.yLabel.replace("_", " ") + + + def addLeftMargin(self, margin): + """ + Increase the size of the space on the left part of the graph + @param margin: the space added + @type margin: float + """ + self.leftMargin = margin + + + def addRightMargin(self, margin): + """ + Increase the size of the space on the right part of the graph + @param margin: the space added + @type margin: float + """ + self.rightMargin = margin + + + def addTopMargin(self, margin): + """ + Increase the size of the 
space at the top of the graph + TopMargin is a percentage if 0 < TopMargin < 1. + TopMargin is a value if TopMargin >= 1. + @param margin: the space added + @type margin: float + """ + self.topMargin = margin + + + def addBottomMargin(self, margin): + """ + Increase the size of the space at the bottom of the graph + @param margin: the space added + @type margin: float + """ + self.bottomMargin = margin + + + def getNewYMaxWithTopMargin(self): + """ + Return new xMin coordinate with left margin + @param xMin: coordinate + @type xMin: float + """ + yMax = self.yMax + if 0 < self.topMargin and self.topMargin < 1: + topMargin = self.topMargin * self.yMax + yMax = self.yMax + topMargin + elif self.topMargin >= 1: + yMax = self.yMax + self.topMargin + return yMax + + + def setTitle(self, title): + """ + Print a title for graph + @param title: a title + @type title: string + """ + self.title = title + if self.title != None: + self.title = self.title.replace("_", " ") + + + def setAxisLabel(self, i, labels): + """ + Change x- or y-labels + @param i: x for x-label, y for y-label + @type i: string + @param labels: new labels + @type labels: int to string dict + """ + i = i.lower() + if i not in ("x", "y"): + raise Exception("Label name '" + i + "' should by 'x' or 'y' while changing axis labels.") + self.axesLabels[{"x": 1, "y": 2}[i]] = labels + + + def rotateAxisLabel(self, i, b = True): + """ + Rotate x- or y-labels + @param i: x for x-label, y for y-label + @type i: string + @param b: whether the labels should be rotated + @type b: boolean + """ + i = i.lower() + if i not in ("x", "y"): + raise Exception("Label name '" + i + "' should by 'x' or 'y' while rotating axis labels.") + self.rotateAxesLabels[{"x": 1, "y": 2}[i]] = b + + def setLineWidth(self, width): + """ + Set the line width in a xy-plot + @param width: the new line width + @type width: int + """ + self.lineWidth = width + + def setLog(self, log): + """ + Use log-scale for axes + @param log: use log scale + 
@type log: boolean + """ + self.logX = ("x" in log) + self.logY = ("y" in log) + self.logZ = ("z" in log) + + + def setBarplot(self, barplot): + """ + Use barplot representation instead + @param barplot: barplot representation + @type barplot: boolean + """ + self.barplot = barplot + + + def setPoints(self, points): + """ + Use points cloud representation instead + @param points: points cloud representation + @type points: boolean + """ + self.points = points + + + def setHeatPoints(self, heatPoints): + """ + Use points cloud representation with color representing another variable instead + @param points: colored points cloud representation + @type points: boolean + """ + self.heatPoints = heatPoints + + + def addBox(self, lXCoordList, minY, maxY): + for lXCoord in lXCoordList: + self.linesToAddBox += "rect(%s,%s,%s,%s,density=50, col='grey',border='transparent')\n" % (lXCoord[0], minY, lXCoord[1], maxY) + + def addLine(self, line, name = "", color = None): + """ + Add a line + @param line: a line to plot + @type line: dict + """ + # prepare data + plot = [] + if self.points or self.heatPoints: + values = line.values() + elif self.fill == None: + values = sorted(line.keys()) + else: + values = range(min(line.keys()), max(line.keys()) + 1) + + for element in values: + if self.points or self.heatPoints: + x = element[0] + y = element[1] + else: + x = element + if x not in line: + y = self.fill + else: + y = line[x] + + if self.minimumX != None and x < self.minimumX: + continue + if self.maximumX != None and x > self.maximumX: + continue + + if x == None: + raise Exception("Problem! x is None. Aborting...") + if y == None: + raise Exception("Problem! y is None. 
Aborting...") + if x == 0 and self.logX: + x = minPositiveValue + if y == 0 and self.logY: + y = minPositiveValue + if self.xMin == None: + if not self.logX or x != 0: + self.xMin = x + else: + if not self.logX or x != 0: + self.xMin = min(self.xMin, x) + if self.xMax == None: + self.xMax = x + else: + self.xMax = max(self.xMax, x) + if self.yMin == None: + if not self.logY or y != 0: + self.yMin = y + else: + if not self.logY or y != 0: + if y != "NA": + self.yMin = min(self.yMin, y) + if self.yMax == None: + self.yMax = y + else: + if y != "NA": + self.yMax = max(self.yMax, y) + + plot.append((x, y)) + + # cluster the data into buckets + if self.bucket != None: + buckets = dict([((int(value) / int(self.bucket)) * self.bucket, 0) for value in xrange(min(line.keys()), max(line.keys())+1)]) + for distance, nb in line.iteritems(): + buckets[(int(distance) / int(self.bucket)) * self.bucket] += nb + self.yMax = max(buckets.values()) + plot = [] + for x, y in buckets.iteritems(): + plot.append((x, y)) + + # write file + dataFileName = "tmpData-%d-%d.dat" % (self.seed, len(self.lines)) + dataHandle = open(dataFileName, "w") + if not self.heatPoints: + plot.sort() + for (x, y) in plot: + if y != "NA": + dataHandle.write("%f\t%f\n" % (x, y)) + else: + dataHandle.write("%f\t%s\n" % (x, y)) + dataHandle.close() + + self.lines.append(line) + self.names.append(name) + + if color == None: + colorNumber = len(self.colors) % (self.nbColors - 1) + 1 + type = "solid" + if len(self.colors) >= self.nbColors: + type = "dashed" + color = "colorPanel[%d]" % (colorNumber) + else: + color = "\"%s\"" % (color) + type = "solid" + self.colors.append(color) + self.types.append(type) + + + def addHeatLine(self, line, name = "", color = None): + """ + Add the heat line + @param line: the line which gives the color of the points + @type line: dict + """ + if not self.heatPoints: + raise Exception("Error! Trying to add a heat point whereas not mentioned to earlier! 
Aborting.") + + dataFileName = "tmpData-%d-%d.dat" % (self.seed, len(self.lines)) + dataHandle = open(dataFileName, "w") + + minimumHeat = min(line.values()) + maximumHeat = max(line.values()) + minLogValue = 0.00001 + log = self.logZ + + if log: + if minimumHeat == 0: + for element in line: + line[element] += minLogValue + minimumHeat += minLogValue + maximumHeat += minLogValue + minimumHeat = math.log10(minimumHeat) + maximumHeat = math.log10(maximumHeat) + + coeff = 255.0 / (maximumHeat - minimumHeat) + + for element in line: + value = line[element] + if log: + value = math.log10(max(minLogValue, value)) + dataHandle.write("\"#%02X%02X00\"\n" % (int((value - minimumHeat) * coeff), 255 - int((value - minimumHeat) * coeff))) + + dataHandle.close() + self.names.append(name) + if color == None: + colorNumber = len(self.colors) % (self.nbColors - 1) + 1 + type = "solid" + if len(self.colors) >= self.nbColors: + type = "dashed" + color = "colorPanel[%d]" % (colorNumber) + else: + color = "\"%s\"" % (color) + type = "solid" + self.colors.append(color) + self.types.append(type) + + + def getScript(self): + """ + Write (unfinished) R script + """ + script = "" + + xMin = self.xMin - self.leftMargin + if self.minimumX != None: + xMin = max(xMin, self.minimumX) + xMax = self.xMax + self.rightMargin + if self.maximumX != None: + xMax = min(xMax, self.maximumX) + yMin = self.yMin - self.bottomMargin + if self.minimumY != None: + yMin = self.minimumY + yMax = self.getNewYMaxWithTopMargin() + yMax += min(1, yMax / 100.0) + if self.maximumY != None: + yMax = self.maximumY + + log = "" + if self.logX: + log += "x" + if self.logY: + log += "y" + if log != "": + log = ", log=\"%s\"" % (log) + + title = "" + if self.title != None: + title = ", main = \"%s\"" % (self.title) + + if self.legend and self.legendBySide: + script += "layout(matrix(c(1,2), 1, 2), widths=c(5,1))\n" + + if self.rotateAxesLabels[2]: + script += "par(mar=c(5,12,4,2))\n" + else: + script += 
"par(mar=c(5,5,4,2))\n" + + addAxes = True + + if self.barplot: + script += "data = scan(\"tmpData-%d-0.dat\", list(x = -666, y = -666))\n" % (self.seed) + if len(self.lines) == 1: + script += "barplot(data$y, name = data$x, xlab=\"%s\", ylab=\"%s\", ylim = c(%f, %f), cex.axis = 2, cex.names = 2, cex.lab = 2%s%s)\n" % (self.xLabel, self.yLabel, yMin, yMax, title, log) + addAxes = False + else: + script += "data1 = scan(\"tmpData-%d-1.dat\", list(x = -666, y = -666))\n" % (self.seed) + script += "barplot(rbind(data$y, data1$y), name = data$x, xlab=\"%s\", ylab=\"%s\", cex.axis = 2, cex.names = 2, cex.lab = 2%s, beside = TRUE, space=c(-1,0), axes = FALSE%s)\n" % (self.xLabel, self.yLabel, title, log) + elif self.points: + script += "data = scan(\"tmpData-%d-0.dat\", list(x = -666, y = -666))\n" % (self.seed) + script += "plot(data$x, data$y, xlab=\"%s\", ylab=\"%s\", cex.axis = 2, cex.lab = 2, axes = FALSE%s%s)\n" % (self.xLabel, self.yLabel, title, log) + if self.regression: + x = "log10(data$x)" if self.logX else "data$x" + y = "log10(data$y)" if self.logY else "data$y" + script += "abline(lm(%s ~ %s))\n" % (y, x) + elif self.heatPoints: + if len(self.lines) != 1: + raise Exception("Error! Bad number of input data! 
Aborting...") + script += "data = scan(\"tmpData-%d-0.dat\", list(x = -666, y = -666))\n" % (self.seed) + script += "heatData = scan(\"tmpData-%d-1.dat\", list(x = \"\"))\n" % (self.seed) + script += "plot(data$x, data$y, col=heatData$x, xlab=\"%s\", ylab=\"%s\", cex.axis = 2, cex.lab = 2, axes = FALSE%s%s)\n" % (self.xLabel, self.yLabel, title, log) + if self.regression: + x = "log10(data$x)" if self.logX else "data$x" + y = "log10(data$y)" if self.logY else "data$y" + script += "abline(lm(%s ~ %s))\n" % (y, x) + else: + script += "plot(x = NA, y = NA, panel.first = grid(lwd = 1.0), xlab=\"%s\", ylab=\"%s\", xlim = c(%f, %f), ylim = c(%f, %f), cex.axis = 2, cex.lab = 2, axes = FALSE%s%s)\n" % (self.xLabel, self.yLabel, xMin, xMax, yMin, yMax, title, log) + for i in range(0, len(self.lines)): + script += "data = scan(\"tmpData-%d-%d.dat\", list(x = -666.666, y = -666.666))\n" % (self.seed, i) + script += "lines(x = data$x, y = data$y, col = %s, lty = \"%s\", lwd = %d)\n" % (self.colors[i], self.types[i], self.lineWidth) + + script += self.linesToAddBox + + if addAxes: + for i in self.axesLabels: + rotation = ", las = 2" if self.rotateAxesLabels[i] else "" + if self.axesLabels[i] == None: + script += "axis(%d, cex.axis = 2, cex.lab = 2%s)\n" % (i, rotation) + else: + oldKeys = ", ".join(["%d" % (key) for key in sorted(self.axesLabels[i].keys())]) + newKeys = ", ".join(["\"%s\"" % (self.axesLabels[i][key]) for key in sorted(self.axesLabels[i].keys())]) + script += "axis(%d, at=c(%s), lab=c(%s), cex.axis = 2, cex.lab = 2%s)\n" % (i, oldKeys, newKeys, rotation) + script += "box()\n" + + if self.legend: + if self.legendBySide: + script += "plot.new()\n" + script += "par(mar=c(0,0,0,0))\n" + script += "plot.window(c(0,1), c(0,1))\n" + script += "legends = c(%s)\n" % ", ".join(["\"%s\"" % name for name in self.names]) + script += "colors = c(%s)\n" % ", ".join(["%s" % color for color in self.colors]) + script += "lineTypes = c(%s)\n" % ", ".join(["\"%s\"" % type for type 
in self.types]) + if self.legendBySide: + script += "legend(0, 1, legend = legends, xjust = 0, yjust = 1, col = colors, lty = lineTypes, lwd = %d, cex = 1.5, ncol = 1, bg = \"white\")\n" % (self.lineWidth) + else: + script += "legend(\"topright\", legend = legends, xjust = 0, yjust = 1, col = colors, lty = lineTypes, lwd = %d, cex = 1.5, ncol = 1, bg = \"white\")\n" % (self.lineWidth) + + return script + + + + def plot(self): + """ + Plot the lines + """ + scriptFileName = "tmpScript-%d.R" % (self.seed) + scriptHandle = open(scriptFileName, "w") + scriptHandle.write("library(RColorBrewer)\n") + scriptHandle.write("colorPanel = brewer.pal(n=%d, name=\"Set1\")\n" % (self.nbColors)) + scriptHandle.write("%s(%s = \"%s\", width = %d, height = %d, bg = \"white\")\n" % (self.format, "filename" if self.format != "pdf" else "file", self.fileName, self.width, self.height)) + scriptHandle.write(self.getScript()) + scriptHandle.write("dev.off()\n") + scriptHandle.close() + rCommand = "R" + if "SMARTRPATH" in os.environ: + rCommand = os.environ["SMARTRPATH"] + command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName) + status = subprocess.call(command, shell=True) + + if status != 0: + self.keep = True + raise Exception("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status)) + + + def getCorrelationData(self): + if not self.regression: + return "" + scriptFileName = "tmpScript-%d.R" % (self.seed) + rScript = open(scriptFileName, "w") + rScript.write("data = scan(\"tmpData-%d-0.dat\", list(x = -0.000000, y = -0.000000))\n" % (self.seed)) + x = "log10(data$x)" if self.logX else "data$x" + y = "log10(data$y)" if self.logY else "data$y" + rScript.write("summary(lm(%s ~ %s))\n" % (y, x)) + rScript.close() + rCommand = "R" + if "SMARTRPATH" in os.environ: + rCommand = os.environ["SMARTRPATH"] + command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName) + status = subprocess.call(command, shell=True) + if status != 0: + self.keep = True + raise 
Exception("Problem with the execution of script file %s computing the correlation, status is: %s" % (scriptFileName, status)) + outputRFile = open("%sout" % (scriptFileName)) + output = "" + start = False + end = False + for line in outputRFile: + if start and "> " in line: + end = True + if start and not end: + output += line + if "summary" in line: + start = True + return output + + + def getSpearmanRho(self): + """ + Get the Spearman rho correlation using R + """ + return None + if not self.points and not self.barplot and not self.heatPoints: + raise Exception("Cannot compute Spearman rho correlation whereas not in 'points' or 'bar' mode.") + + scriptFileName = "tmpScript-%d.R" % (self.seed) + rScript = open(scriptFileName, "w") + rScript.write("library(Hmisc)\n") + rScript.write("data = scan(\"tmpData-%d-0.dat\", list(x = -0.000000, y = -0.000000))\n" % (self.seed)) + rScript.write("spearman(data$x, data$y)\n") + rScript.close() + + rCommand = "R" + if "SMARTRPATH" in os.environ: + rCommand = os.environ["SMARTRPATH"] + command = "\"%s\" CMD BATCH %s" % (rCommand, scriptFileName) + status = subprocess.call(command, shell=True) + + if status != 0: + self.keep = True + raise Exception("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status)) + + outputRFile = open("%sout" % (scriptFileName)) + nextLine = False + for line in outputRFile: + line = line.strip() + if nextLine: + if line == "NA": + return None + return float(line) + nextLine = False + if line == "rho": + nextLine = True + + return None diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/misc/UnlimitedProgress.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/misc/UnlimitedProgress.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,81 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
# You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import sys
import time


class UnlimitedProgress(object):
    """Show the progress of a process when no upper bound is known"""

    def __init__(self, step = 1000, message = "Progress", verbosity = 0):
        """
        Constructor
        @param step:      a status line is written every 'step' increments
        @type  step:      int
        @param message:   text written before the counter (truncated to 50 characters)
        @type  message:   string
        @param verbosity: nothing is written when this is <= 0
        @type  verbosity: int
        """
        self.step           = step
        self.progress       = 0
        self.message        = message
        self.verbosity      = verbosity
        self.maxMessageSize = 50
        self.startTime      = time.time()
        self.elapsed        = 0
        if len(self.message) > self.maxMessageSize:
            # keep room for the ellipsis so that the result is exactly maxMessageSize long
            self.message = self.message[0:self.maxMessageSize-3] + "..."
        self.show()


    def inc(self):
        """
        Count one more processed item and refresh the status line if needed
        """
        self.progress += 1
        self.show()


    def getPrintableElapsedTime(self, time):
        """
        Format a duration as hours/minutes, minutes/seconds or seconds
        @param time: a duration in seconds (this parameter shadows the 'time' module inside the method)
        @type  time: int or float
        @return:     a string such as "  1h  2m", " 1m  5s" or " 5s"
        """
        # divmod keeps everything integral, whatever the division semantics
        # of the interpreter (the bare "/" only floors on Python 2)
        timeHou, remainder = divmod(int(time), 3600)
        timeMin, timeSec   = divmod(remainder, 60)
        if timeHou > 0:
            return "%3dh %2dm" % (timeHou, timeMin)
        if timeMin > 0:
            return "%2dm %2ds" % (timeMin, timeSec)
        return "%2ds" % (timeSec)


    def _getStatusLine(self, elapsed):
        """
        Build the status line shared by show() and done()
        @param elapsed: number of seconds spent since the construction
        @type  elapsed: int or float
        """
        return "%s %d -- time spent: %s\r" % (self.message, self.progress, self.getPrintableElapsedTime(elapsed))


    def show(self):
        """
        Write the status line on stdout, every 'step' items or when more
        than 10 seconds passed since the last update
        """
        if self.verbosity <= 0:
            return
        elapsed = int(time.time() - self.startTime)
        if (self.progress % self.step == 0) or (elapsed > self.elapsed + 10):
            self.elapsed = elapsed
            sys.stdout.write(self._getStatusLine(elapsed))
            sys.stdout.flush()


    def done(self):
        """
        Write the final status line, followed by a line feed
        """
        if self.verbosity > 0:
            elapsed = time.time() - self.startTime
            # same bytes as the former "print string" statement, but valid on Python 2 and 3
            sys.stdout.write("%s\n" % (self._getStatusLine(elapsed)))


# (patch metadata of the next file of the changeset, kept as comments)
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/misc/Utils.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/misc/Utils.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,271 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Some useful functions"""

import os
import sys
import random
import subprocess


def writeFile(fileName, content):
    """
    Write the content of a file
    @param fileName: name of the file to (over)write
    @type  fileName: string
    @param content:  what should be written
    @type  content:  string
    """
    with open(fileName, "w") as handle:
        handle.write(content)


def sumOfLists(list1, list2):
    """
    Element by element sum
    Exit the program if the two lists do not have the same size
    @param list1: a list
    @type  list1: list
    @param list2: another list, of the same size
    @type  list2: list
    @return:      a new list
    """
    if len(list1) != len(list2):
        sys.exit("Cannot sum list whose sizes are different!")
    return [value1 + value2 for value1, value2 in zip(list1, list2)]


def protectBackslashes(string):
    """
    Protect the backslashes in a path by adding another backslash
    @param string: a path
    @type  string: string
    """
    return string.replace("\\", "\\\\")


def getHammingDistance(string1, string2):
    """
    Compute Hamming distance between two strings
    Raise an Exception if the strings have different sizes
    @param string1: a string
    @type  string1: string
    @param string2: another string, of the same size
    @type  string2: string
    @return:        the number of positions where the strings differ
    """
    if len(string1) != len(string2):
        raise Exception("Error, size of %s and %s differ" % (string1, string2))
    return sum(ch1 != ch2 for ch1, ch2 in zip(string1, string2))


def getLevenshteinDistance(string1, string2):
    """
    Compute Levenshtein distance between two strings
    @param string1: a string
    @type  string1: string
    @param string2: another string
    @type  string2: string
    @return:        the minimum number of insertions, deletions and substitutions
    """
    # iterate over the longest string so that a row has the size of the shortest one
    if len(string1) < len(string2):
        return getLevenshteinDistance(string2, string1)
    if not string1:
        return len(string2)
    previousRow = list(range(len(string2) + 1))
    for i, c1 in enumerate(string1):
        currentRow = [i + 1]
        for j, c2 in enumerate(string2):
            insertions    = previousRow[j + 1] + 1
            deletions     = currentRow[j] + 1
            substitutions = previousRow[j] + (c1 != c2)
            currentRow.append(min(insertions, deletions, substitutions))
        previousRow = currentRow
    return previousRow[-1]


def getMinAvgMedMax(values):
    """
    Get some stats about a dict
    @param values: a distribution (the value being the number of occurrences of the key)
    @type  values: dict int to int
    @return:       a tuple (minimum, average, median, maximum)
    """
    minValues = min(values.keys())
    maxValues = max(values.keys())
    sumValues = sum([value * values[value] for value in values])
    nbValues  = sum(values.values())
    # expand the distribution: each key is repeated as many times as it was observed
    sortedValues = []
    for key in sorted(values):
        sortedValues.extend([key] * values[key])
    # "//" keeps the indices integral whatever the division semantics of the interpreter
    if (nbValues % 2 == 0):
        medValues = (sortedValues[nbValues // 2 - 1] + sortedValues[nbValues // 2]) / 2.0
    else:
        medValues = sortedValues[(nbValues + 1) // 2 - 1]
    return (minValues, float(sumValues) / nbValues, medValues, maxValues)


def xor(value1, value2):
    """
    Logical xor
    @param value1: a value
    @type  value1: anything
    @param value2: a value
    @type  value2: anything
    """
    return bool(value1) != bool(value2)


def diff(fileName1, fileName2):
    """
    Compare two files, line by line
    The first difference found, if any, is printed on stdout
    @param fileName1: a file name
    @type  fileName1: string
    @param fileName2: another file name
    @type  fileName2: string
    @return:          True if the files are the same, False otherwise
    """
    with open(fileName1) as handle1:
        lines1 = handle1.readlines()
    with open(fileName2) as handle2:
        lines2 = handle2.readlines()
    if len(lines1) != len(lines2):
        print("Sizes of files differ (%d != %d)" % (len(lines1), len(lines2)))
        return False
    for i, (line1, line2) in enumerate(zip(lines1, lines2)):
        if line1 != line2:
            print("Line %d differ ('%s' != '%s')" % (i, line1.strip(), line2.strip()))
            return False
    return True


def binomialCoefficient(a, b):
    """
    Compute the binomial coefficient "a among b"
    @param a: number of selected elements
    @type  a: int
    @param b: total number of elements
    @type  b: int
    @return:  the coefficient, as a float
    """
    # C(b, a) == C(b, b-a): use the smallest of the two to shorten the products
    if 2 * a > b:
        a = b-a
    p = 1.0
    for i in range(b-a+1, b+1):
        p *= i
    q = 1.0
    for i in range(1, a+1):
        q *= i
    return p / q


# cache of the p-values already computed by fisherExactPValue
memory = {}

# def fisherExactPValue(a, b, c, d):
#     """
#     P-value of Fisher exact test for 2x2 contingency table
#     """
#     if (a, b, c, d) in memory:
#         return memory[(a, b, c, d)]

#     n = a + b + c + d
#     i1 = binomialCoefficient(a, a+b)
#     i2 = binomialCoefficient(c, a+c)
#     i3 = binomialCoefficient(c+d, n)
#     pValue = i1 * i2 / i3

#     memory[(a, b, c, d)] = pValue

#     return pValue


def _runRScript(scriptFileName):
    """
    Run an R script in batch mode and exit the program if R fails
    The R executable is taken from the SMARTRPATH environment variable, if set
    @param scriptFileName: name of the R script
    @type  scriptFileName: string
    @return:               name of the file where the output of R is expected ("<script>out")
    """
    rCommand = os.environ.get("SMARTRPATH", "R")
    # an argument list needs no shell, hence no quoting of the path of R
    status = subprocess.call([rCommand, "CMD", "BATCH", scriptFileName])
    if status != 0:
        sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))
    return "%sout" % (scriptFileName)


def fisherExactPValue(a, b, c, d):
    """
    P-value of Fisher exact test for 2x2 contingency table, computed by R
    Results are cached in the module-level 'memory' dict
    Exit the program if R fails or if no p-value is found in its output
    @param a: first cell of the table
    @type  a: int
    @param b: second cell of the table
    @type  b: int
    @param c: third cell of the table
    @type  c: int
    @param d: fourth cell of the table
    @type  d: int
    @return:  the p-value
    """
    if (a, b, c, d) in memory:
        return memory[(a, b, c, d)]

    # NOTE: the name is only random; two concurrent runs in the same directory may collide
    scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
    with open(scriptFileName, "w") as rScript:
        rScript.write("data = matrix(c(%d, %d, %d, %d), nr=2)\n" % (a, b, c, d))
        rScript.write("fisher.test(data)\n")
        #rScript.write("chisq.test(data)\n")

    outputRFileName = _runRScript(scriptFileName)

    pValue    = None
    pValueTag = "p-value "
    with open(outputRFileName) as outputRFile:
        for line in outputRFile:
            line = line.strip()
            if line == "": continue
            for splittedLine in line.split(","):
                splittedLine = splittedLine.strip()
                if splittedLine.startswith(pValueTag):
                    pValue = float(splittedLine.split()[-1])
                    break

    if pValue is None:
        # the temporary files are kept on purpose: the message points to them
        sys.exit("Problem with the cannot find p-value! File %s, values are: %d, %d, %d, %d" % (scriptFileName, a, b, c, d))

    os.remove(scriptFileName)
    os.remove(outputRFileName)

    memory[(a, b, c, d)] = pValue

    return pValue


def fisherExactPValueBulk(list):
    """
    P-values of Fisher exact tests for several 2x2 contingency tables, computed by one run of R
    Exit the program if R fails or does not give one p-value per table
    @param list: the tables; the four first items of each element are the cells
    @type  list: list of sliceable elements (the key of the result is element[0:2], which must be hashable)
    @return:     the p-values
    @rtype:      dict element[0:2] to float
    """
    # NOTE: the name is only random; two concurrent runs in the same directory may collide
    scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))
    with open(scriptFileName, "w") as rScript:
        for element in list:
            rScript.write("fisher.test(matrix(c(%d, %d, %d, %d), nr=2))$p.value\n" % (int(element[0]), int(element[1]), int(element[2]), int(element[3])))

    outputRFileName = _runRScript(scriptFileName)

    pValue    = None
    pValueTag = "[1] "
    results   = {}
    cpt       = 0
    with open(outputRFileName) as outputRFile:
        for line in outputRFile:
            line = line.strip()
            if line == "": continue
            if line.startswith(pValueTag):
                # the i-th printed value is the answer for the i-th table
                pValue = float(line.split()[-1])
                results[list[cpt][0:2]] = pValue
                cpt += 1

    if pValue is None:
        sys.exit("Problem with the cannot find p-value!")
    if cpt != len(list):
        sys.exit("Error in the number of p-values computed by R in file '%s'!" % (scriptFileName))

    os.remove(scriptFileName)
    os.remove(outputRFileName)

    return results


# (patch metadata of the next files of the changeset, kept as comments)
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/misc/__init__.py
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/modifyFasta.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/modifyFasta.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,62 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
# You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Modify the content of a FASTA file"""

from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.writer.FastaWriter import FastaWriter
from SMART.Java.Python.misc.Progress import Progress


def main():
    """
    Read the sequences of a FASTA file, optionally keep their first and/or
    last nucleotides, and write the result to another FASTA file
    """
    # parse command line
    description = "Modify Sequence List v1.0.1: Extend or shrink a list of sequences. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input",     dest="inputFileName",  action="store",               type="string", help="input file [compulsory] [format: file in FASTA format]")
    optionParser.add_option("-o", "--output",    dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in FASTA format]")
    optionParser.add_option("-s", "--start",     dest="start",          action="store", default=None, type="int",    help="keep first nucleotides [format: int]")
    optionParser.add_option("-e", "--end",       dest="end",            action="store", default=None, type="int",    help="keep last nucleotides [format: int]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # both files are announced as compulsory by the help: fail early, with the usage
    if options.inputFileName is None:
        optionParser.error("the input file (-i) is compulsory")
    if options.outputFileName is None:
        optionParser.error("the output file (-o) is compulsory")

    # the FASTA reader gets its own name instead of rebinding the option parser
    fastaParser = FastaParser(options.inputFileName, options.verbosity)
    writer      = FastaWriter(options.outputFileName, options.verbosity)
    progress    = Progress(fastaParser.getNbSequences(), "Reading file %s" % (options.inputFileName), options.verbosity)
    for sequence in fastaParser.getIterator():
        if options.start is not None:
            sequence.shrinkToFirstNucleotides(options.start)
        if options.end is not None:
            sequence.shrinkToLastNucleotides(options.end)
        writer.addSequence(sequence)
        progress.inc()
    progress.done()


if __name__ == "__main__":
    main()


# (patch metadata of the next file of the changeset, kept as comments)
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/modifyGenomicCoordinates.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/modifyGenomicCoordinates.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,80 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
"""Modify the genomic coordinates of a file"""

from optparse import OptionParser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress


if __name__ == "__main__":

    # parse command line
    description = "Modify Genomic Coordinates v1.0.1: Extend or shrink a list of genomic coordinates. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input",      dest="inputFileName",  action="store",               type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    optionParser.add_option("-f", "--format",     dest="format",         action="store",               type="string", help="format of the input [compulsory] [format: transcript file format]")
    optionParser.add_option("-o", "--output",     dest="outputFileName", action="store",               type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    optionParser.add_option("-s", "--start",      dest="start",          action="store", default=None, type="int",    help="restrict to the start of the transcript [format: int]")
    optionParser.add_option("-e", "--end",        dest="end",            action="store", default=None, type="int",    help="restrict to the end of the transcript [format: int]")
    optionParser.add_option("-5", "--fivePrime",  dest="fivePrime",      action="store", default=None, type="int",    help="extend to the 5' direction [format: int]")
    optionParser.add_option("-3", "--threePrime", dest="threePrime",     action="store", default=None, type="int",    help="extend to the 3' direction [format: int]")
    optionParser.add_option("-v", "--verbosity",  dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")

    (options, args) = optionParser.parse_args()

    # the reader gets its own name instead of rebinding the option parser
    container = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    writer    = TranscriptWriter(options.outputFileName, "gff3", options.verbosity)

    nbItems = container.getNbItems()
    # single-argument print(...) gives the same output on Python 2 and parses on Python 3
    print("%i items found" % (nbItems))

    progress = Progress(nbItems, "Analyzing sequences of " + options.inputFileName, options.verbosity)
    for transcript in container.getIterator():
        # each operation is applied only when its option was given; 0 is a valid value,
        # hence the explicit comparison with None
        if options.start is not None:
            transcript.restrictStart(options.start)
        if options.end is not None:
            transcript.restrictEnd(options.end)
        if options.fivePrime is not None:
            transcript.extendStart(options.fivePrime)
        if options.threePrime is not None:
            transcript.extendEnd(options.threePrime)

        writer.addTranscript(transcript)

        progress.inc()
    progress.done()

    writer.write()
"""Modify the content of a FASTA file"""
import sys
from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from SMART.Java.Python.misc.Progress import Progress


if __name__ == "__main__":

    # parse command line
    description = "Modify Sequence List v1.0.1: Extend or shrink a list of sequences. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input",     dest="inputFileName",  action="store",               type="string", help="input file [compulsory] [format: file in format given by -f]")
    optionParser.add_option("-o", "--output",    dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in format given by -f]")
    optionParser.add_option("-f", "--format",    dest="format",         action="store",               type="string", help="format of the file [compulsory] [format: sequence file format]")
    optionParser.add_option("-s", "--start",     dest="start",          action="store", default=None, type="int",    help="keep first nucleotides [format: int]")
    optionParser.add_option("-e", "--end",       dest="end",            action="store", default=None, type="int",    help="keep last nucleotides [format: int]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # pick the reader/writer pair matching the requested format
    if options.format == "fasta":
        sequenceParser = FastaParser(options.inputFileName, options.verbosity)
        writer         = FastaWriter(options.outputFileName, options.verbosity)
    elif options.format == "fastq":
        sequenceParser = FastqParser(options.inputFileName, options.verbosity)
        writer         = FastqWriter(options.outputFileName, options.verbosity)
    else:
        sys.exit("Do not understand '%s' file format." % (options.format))

    progress = Progress(sequenceParser.getNbSequences(), "Reading file %s" % (options.inputFileName), options.verbosity)
    for sequence in sequenceParser.getIterator():
        # 0 is a legal size, so test against None rather than truthiness
        if options.start is not None:
            sequence.shrinkToFirstNucleotides(options.start)
        if options.end is not None:
            sequence.shrinkToLastNucleotides(options.end)
        writer.addSequence(sequence)
        progress.inc()
    progress.done()
import os
import random
import sqlite3
from SMART.Java.Python.mySql.MySqlQuery import MySqlQuery


class MySqlConnection(object):
    """
    Connection to a database
    Despite its name, the class opens a SQLite file (through the sqlite3 module);
    the file is created in the directory given by the SMARTTMPPATH environment
    variable, or in the current directory if the variable is not set
    @ivar verbosity:    verbosity
    @type verbosity:    int
    @ivar databaseName: path of the SQLite file
    @type databaseName: string
    @ivar connection:   the underlying sqlite3 connection
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        Open the database file and issue the start-up pragmas
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity    = verbosity
        self.databaseName = os.path.join(os.environ.get("SMARTTMPPATH", "."), "smartdb%d" % random.randint(0, 100000))
        self.connection   = sqlite3.connect(self.databaseName)
        self.executeQuery("PRAGMA journal_mode = OFF")
        self.executeQuery("PRAGMA synchronous = 0")
        self.executeQuery("PRAGMA locking_mode = EXCLUSIVE")
        # NOTE(review): the pragma documented by SQLite is spelled "count_changes";
        # confirm whether this statement has any effect
        self.executeQuery("PRAGMA count_change = OFF")
        self.executeQuery("PRAGMA temp_store = 2")


    def __del__(self):
        # the attribute is missing if sqlite3.connect raised in the constructor
        connection = getattr(self, "connection", None)
        if connection is not None:
            connection.close()


    def createDatabase(self):
        """
        Do nothing (the file is created when the connection is opened)
        """
        pass


    def deleteDatabase(self):
        """
        Remove the database file, if it exists
        """
        if os.path.exists(self.databaseName):
            os.remove(self.databaseName)


    def _runAndCommit(self, action):
        """
        Run an action and commit; if anything fails, run it and commit once more
        @param action: a callable taking no parameter
        @return:       what the action returned
        """
        try:
            result = action()
            self.connection.commit()
        except Exception:
            # "Exception" (not a bare except) so that Ctrl-C and sys.exit() are
            # not answered by executing the statements a second time
            result = action()
            self.connection.commit()
        return result


    def executeQuery(self, command, insertion = False):
        """
        Execute one SQL command (retried once on failure)
        @param command:   the SQL command
        @type  command:   string
        @param insertion: whether the id of the inserted row should be returned
        @type  insertion: boolean
        @return:          the value given by MySqlQuery.execute if insertion is set, the query otherwise
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        result = self._runAndCommit(lambda: query.execute(command, insertion))
        if insertion:
            return result
        return query


    def executeManyQueries(self, commands):
        """
        Execute several SQL commands and commit once (retried once on failure)
        @param commands: the SQL commands
        @type  commands: list of string
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        def runAll():
            for command in commands:
                query.execute(command)
        self._runAndCommit(runAll)


    def executeManyFormattedQueries(self, command, lines, insertion = False):
        """
        Execute the same parameterised command once per line, then commit
        @param command:   the SQL command, with "?" place-holders
        @type  command:   string
        @param lines:     the parameters of each execution
        @type  lines:     list of list
        @param insertion: whether the id of the last inserted row should be returned
        @type  insertion: boolean
        @return:          the last row id if insertion is set, the query otherwise
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        for line in lines:
            query.executeFormat(command, line)
        self.connection.commit()
        if insertion:
            # read the id here: the cursor is closed as soon as "query" is
            # collected, so handing the cursor back would be of no use
            return cursor.lastrowid
        return query


    def executeManyQueriesIterator(self, table):
        """
        Execute all the commands given by an iterator (retried once on failure)
        @param table: an object whose getIterator method yields SQL commands
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        def runAll():
            for command in table.getIterator():
                query.execute(command)
        self._runAndCommit(runAll)


    def executeManyFormattedQueriesIterator(self, table):
        """
        Execute all the parameterised commands given by an iterator (retried once on failure)
        @param table: an object whose getIterator method yields (command, values) pairs
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        def runAll():
            for command, values in table.getIterator():
                # the retry must use executeFormat too: "execute" would take
                # the values for its "insertion" flag
                query.executeFormat(command, values)
        self._runAndCommit(runAll)


    def executeFormattedQuery(self, command, parameters, insertion = False):
        """
        Execute one parameterised SQL command, then commit
        @param command:    the SQL command, with "?" place-holders
        @type  command:    string
        @param parameters: the values of the place-holders
        @type  parameters: list
        @param insertion:  whether the id of the inserted row should be returned
        @type  insertion:  boolean
        @return:           the inserted row id if insertion is set, the query otherwise
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        query.executeFormat(command, parameters)
        self.connection.commit()
        if insertion:
            # same contract as executeQuery: an id, not a soon-to-be-closed cursor
            return cursor.lastrowid
        return query
import random
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.mySql.MySqlTable import MySqlTable


class MySqlExonTable(MySqlTable):
    """
    A table of exons, each row carrying the id of the transcript it belongs to
    The table name always ends with "_exons"
    """

    def __init__(self, connection, name = None, chromosome = None, verbosity = 0):
        """
        Constructor
        @param connection: connection to a database
        @param name:       base name of the table (a random one is drawn if not given)
        @type  name:       string
        @param chromosome: if given, appended to the base name after a "_"
        @type  chromosome: string
        @param verbosity:  verbosity
        @type  verbosity:  int
        """
        suffix = "" if chromosome == None else "_%s" % chromosome
        if name == None:
            name = "TmpTable_%d" % (random.randint(0, 100000))
        fullName = "%s%s_exons" % (name, suffix)
        super(MySqlExonTable, self).__init__(connection, fullName, verbosity)


    def createExonTable(self):
        """
        Create the table: the columns of an interval, plus the transcript id
        """
        columns = Interval.getSqlVariables()
        columns.append("transcriptId")
        columnTypes = Interval.getSqlTypes()
        columnTypes["transcriptId"] = "int"
        columnSizes = Interval.getSqlSizes()
        columnSizes["transcriptId"] = 11
        self.create(columns, columnTypes, columnSizes)


    def rename(self, name):
        """
        Rename the table (the "_exons" suffix is added to the given name)
        @param name: the new base name
        @type  name: string
        """
        newName = "%s_exons" % name
        super(MySqlExonTable, self).rename(newName)


    def addExon(self, exon, transcriptId):
        """
        Store an exon, and record the id of the new row in the exon
        @param exon:         the exon to store
        @param transcriptId: the id of the transcript of the exon
        @type  transcriptId: int
        """
        row = exon.getSqlValues()
        row["transcriptId"] = transcriptId
        exon.id = self.addLine(row)


    def retrieveExonsFromTranscriptId(self, transcriptId):
        """
        Get the exons of one transcript
        @param transcriptId: the id of the transcript
        @type  transcriptId: int
        @return:             a list of intervals (empty if the table is not created)
        """
        if not self.created:
            return []
        command = "SELECT * FROM %s WHERE transcriptId = %d" % (self.name, transcriptId)
        query   = self.mySqlConnection.executeQuery(command)
        found   = []
        for row in query.getIterator():
            interval = Interval()
            interval.setSqlValues(row)
            found.append(interval)
        return found


    def retrieveExonsFromBulkTranscriptIds(self, transcriptIds):
        """
        Get the exons of several transcripts with one query
        @param transcriptIds: the ids of the transcripts
        @type  transcriptIds: list of int
        @return:              a dict: transcript id -> list of intervals
        """
        if not transcriptIds or not self.created:
            return {}
        # every requested id gets an entry, even if it has no exon
        exonsPerTranscript = {}
        for transcriptId in transcriptIds:
            exonsPerTranscript[transcriptId] = []
        idList  = ", ".join(["%s" % (transcriptId) for transcriptId in transcriptIds])
        command = "SELECT * FROM %s WHERE transcriptId IN (%s)" % (self.name, idList)
        query   = self.mySqlConnection.executeQuery(command)
        for row in query.getIterator():
            interval = Interval()
            interval.setSqlValues(row)
            # the transcript id is the last column of the row
            exonsPerTranscript[row[-1]].append(interval)
        return exonsPerTranscript


    def removeFromTranscriptId(self, transcriptId):
        """
        Remove all the exons of a transcript
        @param transcriptId: the id of the transcript
        @type  transcriptId: int
        """
        command = "DELETE FROM %s WHERE transcriptId = %d" % (self.name, transcriptId)
        self.mySqlConnection.executeQuery(command)
class MySqlQuery(object):
    """
    Query to a database
    Thin wrapper around a database cursor; the cursor is closed when the query is deleted
    @ivar verbosity:  verbosity (queries are printed when it is greater than 99)
    @type verbosity:  int
    @ivar cursor:     the wrapped cursor
    @ivar insertedId: id of the row added by the last insertion, if any
    @type insertedId: int
    """

    def __init__(self, cursor, verbosity = 0):
        """
        Constructor
        @param cursor:    the cursor used to run the commands
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity  = verbosity
        self.cursor     = cursor
        self.insertedId = None


    def __del__(self):
        self.cursor.close()


    def execute(self, query, insertion = False):
        """
        Execute a SQL command
        @param query:     the SQL command
        @type  query:     string
        @param insertion: whether the id of the inserted row should be returned
        @type  insertion: boolean
        @return:          the id of the inserted row if insertion is set, what the cursor returned otherwise
        @raise Exception: if the command fails (the message holds the command and the cause)
        """
        if self.verbosity > 99:
            print("Querying %s" % (query))
        try:
            results = self.cursor.execute(query)
        except Exception as error:
            # keep the cause in the message, otherwise the failure cannot be diagnosed
            raise Exception("Error! Command \"%s\" failed! (%s)" % (query, error))
        if insertion:
            # remember the id so that getInsertedId can report it
            self.insertedId = self.cursor.lastrowid
            return self.insertedId
        return results


    def executeFormat(self, query, parameters):
        """
        Execute a parameterised SQL command
        @param query:      the SQL command, with place-holders
        @type  query:      string
        @param parameters: the values of the place-holders
        @type  parameters: list
        @return:           what the cursor returned
        """
        if self.verbosity > 99:
            # one line: the command, then the parameters separated by spaces
            print(" ".join(["Querying %s |" % (query)] + ["%s" % (parameter, ) for parameter in parameters]))
        results = self.cursor.execute(query, parameters)
        return results


    def getLine(self):
        """
        Fetch the next row of the result
        @return: the row, or None if there is no more row
        """
        return self.cursor.fetchone()


    def getLines(self, lines = None):
        """
        Fetch several rows of the result
        @param lines: the number of rows to fetch (all the remaining ones if not given)
        @type  lines: int
        @return:      a list of rows
        """
        if lines is None:
            return self.cursor.fetchall()
        return self.cursor.fetchmany(lines)


    def isEmpty(self):
        """
        Check whether the last command produced or modified no row
        The remaining rows of the result are consumed by this call
        @return: a boolean
        """
        if self.getLines():
            return False
        # no row was fetched: either an empty SELECT (sqlite3 then reports a
        # row count of -1) or a modification, for which the count is the
        # number of changed rows
        return self.cursor.rowcount in (None, 0, -1)


    def getInsertedId(self):
        """
        Get the id of the row added by the last insertion
        @return: the id, or None if no insertion was executed
        """
        return self.insertedId


    def getIterator(self):
        """
        Iterate on the rows of the result
        @return: an iterator to the rows
        """
        line = self.getLine()
        while line is not None:
            yield line
            line = self.getLine()


    def show(self):
        """
        Print the rows of the result
        """
        for line in self.getIterator():
            print(line)
import re
import sys

class MySqlTable(object):
    """
    Store a table of a mySQL database, used for transcripts or exons
    Record a name and a type (int, float, double) for each column
    @ivar name:            name of the table
    @type name:            string
    @ivar variables:       name of the columns
    @type variables:       list of string
    @ivar types:           type of the columns
    @type types:           dict of string
    @ivar sizes:           size of the types of the columns
    @type sizes:           dict of int
    @ivar mySqlConnection: connection to a database
    @type mySqlConnection: class L{MySqlConnection}
    @ivar nbLines:         number of rows
    @type nbLines:         int
    @ivar created:         whether the table exists in the database
    @type created:         boolean
    @ivar verbosity:       verbosity
    @type verbosity:       int
    """

    # a declared column type, as written by "create": a name and a size, e.g. "varchar(100)"
    _DECLARED_TYPE = re.compile(r"(\w+)\((\d+)\)")

    def __init__(self, connection, name, verbosity = 0):
        """
        Constructor
        Possibly retrieve column names and types if table exists
        @param connection: connection to a database
        @type  connection: class L{MySqlConnection}
        @param name:       name of the table
        @type  name:       string
        @param verbosity:  verbosity
        @type  verbosity:  int
        """
        self.name            = name
        self.variables       = []
        self.types           = {}
        self.sizes           = {}
        self.nbLines         = None
        self.verbosity       = verbosity
        self.mySqlConnection = connection
        queryTables = self.mySqlConnection.executeQuery("SELECT name FROM sqlite_master WHERE type LIKE 'table' AND name LIKE '%s'" % (self.name))
        self.created = not queryTables.isEmpty()
        if self.created:
            queryFields = self.mySqlConnection.executeQuery("PRAGMA table_info('%s')" % (name))
            for field in queryFields.getIterator():
                # rows of table_info: field[1] is the column name and field[2] its
                # declared type; field[3] is the "not null" flag, not a size
                if field[1] != "id":
                    self.variables.append(field[1])
                    sqlType, size = self._splitDeclaredType(field[2])
                    self.types[field[1]] = sqlType
                    self.sizes[field[1]] = size


    def _splitDeclaredType(declaredType):
        """
        Split a declared type such as "varchar(100)" into ("varchar", 100); the size is None if the type has none
        """
        m = MySqlTable._DECLARED_TYPE.search(declaredType)
        if m is None:
            return (declaredType, None)
        return (m.group(1), int(m.group(2)))
    _splitDeclaredType = staticmethod(_splitDeclaredType)


    def getName(self):
        """
        Get the name of the table
        @return: the name
        """
        return self.name


    def create(self, variables, types, sizes):
        """
        Create a table using give column names and types
        An "id" primary key is always added as first column; an existing table is removed first
        @param variables: names of the columns
        @type  variables: list of string
        @param types:     types of the columns
        @type  types:     dict of string
        @param sizes:     sizes of the types
        @type  sizes:     dict of int
        """
        self.variables = variables
        self.types     = types
        self.sizes     = sizes
        if self.created:
            self.remove()
        columns = ["id INTEGER PRIMARY KEY"] + ["%s %s(%d)" % (variable, types[variable], sizes[variable]) for variable in variables]
        query   = "CREATE TABLE '%s' (%s)" % (self.name, ", ".join(columns))
        self.mySqlConnection.executeQuery(query)
        self.created = True


    def insertMany(self, lines):
        """
        Insert many lines, with one literal SQL command per line
        @param lines: the list of values
        @type  lines: list of dict (column name -> value)
        """
        commands = []
        for values in lines:
            commands.append("INSERT INTO '%s' (%s) VALUES (%s)" % (self.name, ", ".join(self.variables), ", ".join([MySqlTable.formatSql(values[variable], self.types[variable], self.sizes[variable]) for variable in self.variables])))
        self.mySqlConnection.executeManyQueries(commands)


    def insertManyFormatted(self, lines):
        """
        Insert many lines, with one parameterised SQL command
        @param lines: the list of values
        @type  lines: list of dict (column name -> value)
        """
        replacer = ["?"] * len(self.variables)
        command  = "INSERT INTO '%s' (%s) VALUES (%s)" % (self.name, ", ".join(self.variables), ", ".join(replacer))
        values   = [[line[variable] for variable in self.variables] for line in lines]
        self.mySqlConnection.executeManyFormattedQueries(command, values)


    def rename(self, name):
        """
        Rename the table
        @param name: the new name
        @type  name: string
        """
        # the data base is SQLite, which has no "RENAME TABLE" statement
        self.mySqlConnection.executeQuery("ALTER TABLE '%s' RENAME TO '%s'" % (self.name, name))
        self.name = name


    def copy(self, table):
        """
        Copy the given table this one
        @param table:     the table to be copied
        @type  table:     class L{MySqlTable}
        @raise Exception: if the type of a column has no size
        """
        variables = []
        types     = {}
        sizes     = {}
        fields = self.mySqlConnection.executeQuery("PRAGMA table_info(%s)" % (table.name))
        for field in fields.getIterator():
            if field[1] != "id":
                variables.append(field[1])
                sqlType, size = self._splitDeclaredType(field[2])
                if size is None:
                    raise Exception("\nFormat %s in table %s is strange." % (field[2], table.name))
                types[field[1]] = sqlType
                sizes[field[1]] = size
        self.create(variables, types, sizes)
        self.mySqlConnection.executeQuery("INSERT INTO '%s' SELECT * FROM %s" % (self.name, table.name))


    def add(self, table):
        """
        Add the content of a table to this one
        @param table: the table to be added
        @type  table: class L{MySqlTable}
        """
        self.mySqlConnection.executeQuery("INSERT INTO '%s' SELECT * FROM %s" % (self.name, table.name))
        self.created = True


    def exists(self):
        """
        Check if the table exists in mySQL
        @return: true if it exits
        """
        return self.created


    def remove(self):
        """
        Remove this table
        """
        if self.exists():
            query = "DROP TABLE IF EXISTS '%s'" % (self.name)
            self.mySqlConnection.executeQuery(query)
        self.created = False


    def clear(self):
        """
        Clear the content of this table
        """
        self.mySqlConnection.executeQuery("DELETE FROM '%s'" % (self.name))


    def getNbElements(self):
        """
        Count the number of rows in the table
        @return: the number of rows
        """
        command = "SELECT COUNT(id) FROM '%s'" % (self.name)
        query   = self.mySqlConnection.executeQuery(command)
        return int(query.getLine()[0])


    def formatSql(cls, value, type, size):
        """
        Format a value using MySQL encapsulation
        @param value:     the value to format
        @param type:      the SQL type of the column
        @type  type:      string
        @param size:      the size of the type (only used to cut strings); None for no limit
        @type  size:      int
        @return:          the value, as it should be written in a SQL command
        @raise Exception: if the type is not int, float, double nor varchar
        """
        if type.find("int") != -1:
            return "%d" % value
        if type.find("float") != -1:
            return "%.10f" % value
        if type.find("double") != -1:
            return "%.20f" % value
        if type.find("varchar") != -1:
            if size is not None and len(value) > size:
                value = value[0:size]
            # cut first, then double the quotes: a quote inside the text would
            # otherwise end the SQL string literal
            text = "%s" % (value, )
            return "'%s'" % (text.replace("'", "''"))
        raise Exception("Do not understand type %s" % (type))
    formatSql = classmethod(formatSql)


    def addLine(self, values):
        """
        Add a row to this table
        @param values: the values of the row
        @type  values: dict
        @return:       the id of the added row
        """
        sqlValues = [values[variable] for variable in self.variables]
        replacer  = ["?"] * len(self.variables)
        command   = "INSERT INTO '%s' (%s) VALUES (%s)" % (self.name, ", ".join(self.variables), ", ".join(replacer))
        # without the insertion flag, the connection gives back the query
        # itself, whose cursor knows the id of the new row
        query = self.mySqlConnection.executeFormattedQuery(command, sqlValues)
        return query.cursor.lastrowid


    def retrieveFromId(self, id):
        """
        Retrieve a row from its id
        @param id:        the id of the row
        @type  id:        int
        @return:          the row
        @raise Exception: if no row has this id
        """
        query  = self.mySqlConnection.executeQuery("SELECT * FROM '%s' WHERE id = %d" % (self.name, id))
        result = query.getLine()
        if result is None:
            raise Exception("Error! Id %d is not in the table %s!" % (id, self.name))
        return result


    def retrieveBulkFromId(self, ids):
        """
        Retrieve several rows from their ids
        @param ids:       the ids of the rows
        @type  ids:       list of int
        @return:          the rows
        @raise Exception: if the number of rows found is not the number of ids
        """
        if not ids:
            return []
        MAXSIZE = 1000
        results = []
        # query by batches, so that the "IN" list never holds more than MAXSIZE ids
        for start in range(0, len(ids), MAXSIZE):
            theseIds = ids[start : start + MAXSIZE]
            idList   = ", ".join(["%d" % (id) for id in theseIds])
            query    = self.mySqlConnection.executeQuery("SELECT * FROM '%s' WHERE id IN (%s)" % (self.name, idList))
            lines    = query.getLines()
            if len(lines) != len(theseIds):
                raise Exception("Error! Some Ids of (%s) is are missing in the table '%s' (got %d instead of %d)!" % (idList, self.name, len(lines), len(theseIds)))
            results.extend(lines)
        return results


    def removeFromId(self, id):
        """
        Remove a row from its id
        @param id: the id of the row
        @type  id: int
        """
        self.mySqlConnection.executeQuery("DELETE FROM '%s' WHERE id = %d" % (self.name, id))


    def getIterator(self):
        """
        Iterate on the content of table
        The rows are read by chunks of 1000
        @return: iterator to the rows of the table
        """
        if not self.created:
            return
        MAXSIZE = 1000
        query   = self.mySqlConnection.executeQuery("SELECT count(id) FROM '%s'" % (self.name))
        nbRows  = int(query.getLine()[0])
        # "//": the number of chunks is an integer, whatever the Python version
        for chunk in range((nbRows // MAXSIZE) + 1):
            query = self.mySqlConnection.executeQuery("SELECT * FROM '%s' LIMIT %d, %d" % (self.name, chunk * MAXSIZE, MAXSIZE))
            for line in query.getIterator():
                yield line


    def createIndex(self, indexName, values, unique = False, fullText = False):
        """
        Add an index on the table
        @param indexName: name of the index
        @type  indexName: string
        @param values:    names of the columns to be indexed (they are joined with ", ")
        @type  values:    list of string
        @param unique:    if the index is unique
        @type  unique:    boolean
        @param fullText:  whether full text should be indexed
        @type  fullText:  boolean
        """
        # NOTE(review): "FULLTEXT" comes from MySQL; confirm that the SQLite back-end accepts it
        self.mySqlConnection.executeQuery("CREATE %s%sINDEX '%s' ON '%s' (%s)" % ("UNIQUE " if unique else "", "FULLTEXT " if fullText else "", indexName, self.name, ", ".join(values)))


    def setDefaultTagValue(self, field, name, value):
        """
        Add a tag value to the rows which do not have this tag yet
        @param field: index, in a row, of the column holding the tags
        @type  field: int
        @param name:  name of the tag
        @type  name:  string
        @param value: value of the tag
        @type  value: string or int
        """
        newData = {}
        for line in MySqlTable.getIterator(self):
            id   = line[0]
            tags = line[field]
            if tags == '':
                newTag = "%s=%s" % (name, value)
            else:
                newTag = "%s;%s=%s" % (tags, name, value)
            # tags are stored as "name=value" pairs separated by ";"
            if name not in [tag.split("=")[0] for tag in tags.split(";")]:
                newData[id] = newTag
        # the updates are done once the table is read; the values are given as
        # parameters, so that a quote in a tag cannot break the command
        command = "UPDATE '%s' SET tags = ? WHERE id = ?" % (self.name)
        for id, tag in newData.items():
            self.mySqlConnection.executeFormattedQuery(command, [tag, id])


    def show(self):
        """
        Print the content of the current table
        """
        query = self.mySqlConnection.executeQuery("SELECT * FROM '%s'" % (self.name))
        print(query.getLines())
import random
import sys
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from SMART.Java.Python.mySql.MySqlExonTable import MySqlExonTable
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress

class MySqlTranscriptTable(MySqlTable):
    """
    A table of transcripts in a mySQL database
    The table name always ends with "_transcripts"
    """

    def __init__(self, connection, name = None, chromosome = None, verbosity = 0):
        """
        Constructor
        @param connection: connection to a database
        @param name:       base name of the table (a random one is drawn if not given)
        @type  name:       string
        @param chromosome: if given, appended to the base name after a "_"
        @type  chromosome: string
        @param verbosity:  verbosity
        @type  verbosity:  int
        """
        if chromosome is None:
            chromosome = ""
        else:
            chromosome = "_%s" % chromosome
        if name is None:
            name = "TmpTable_%d" % (random.randint(0, 100000))
        name = "%s%s" % (name, chromosome)
        super(MySqlTranscriptTable, self).__init__(connection, "%s_transcripts" % name, verbosity)


    def createTranscriptTable(self):
        """
        Create the table, with the columns given by the Transcript class
        """
        self.create(Transcript.getSqlVariables(), Transcript.getSqlTypes(), Transcript.getSqlSizes())


    def rename(self, name):
        """
        Rename the table (the "_transcripts" suffix is added to the given name)
        @param name: the new base name
        @type  name: string
        """
        super(MySqlTranscriptTable, self).rename("%s_transcripts" % name)


    def remove(self):
        """
        Remove this table
        """
        super(MySqlTranscriptTable, self).remove()


    def clear(self):
        """
        Clear the content of this table
        """
        super(MySqlTranscriptTable, self).clear()


    def copy(self, transcriptTable):
        """
        Replace this table by a copy of the given one
        @param transcriptTable: the table to be copied
        @type  transcriptTable: class L{MySqlTranscriptTable}
        """
        self.remove()
        super(MySqlTranscriptTable, self).copy(transcriptTable)


    def add(self, transcriptTable):
        """
        Add the content of a table to this one
        @param transcriptTable: the table to be added
        @type  transcriptTable: class L{MySqlTranscriptTable}
        """
        super(MySqlTranscriptTable, self).add(transcriptTable)


    def addTranscript(self, transcript):
        """
        Store a transcript, and record the id of the new row in the transcript
        @param transcript: the transcript to store
        """
        id = self.addLine(transcript.getSqlValues())
        transcript.id = id


    def addTranscriptList(self, transcriptList):
        """
        Store all the transcripts of a list
        @param transcriptList: the list to store
        """
        progress = Progress(transcriptList.getNbTranscript(), "Storing list to %s" % (self.name), self.verbosity)
        for transcript in transcriptList.getIterator():
            self.addTranscript(transcript)
            progress.inc()
        progress.done()


    def removeTranscript(self, transcript):
        """
        Remove the row of a transcript, using the id recorded in the transcript
        @param transcript: the transcript to remove
        """
        self.removeFromId(transcript.id)


    def retrieveTranscriptFromId(self, id):
        """
        Retrieve a transcript from the id of its row
        @param id: the id of the row
        @type  id: int
        @return:   the transcript
        """
        transcript = Transcript()
        transcript.setSqlValues(self.retrieveFromId(id))
        return transcript


    def retrieveBulkTranscriptFromId(self, ids):
        """
        Retrieve several transcripts from the ids of their rows
        @param ids: the ids of the rows
        @type  ids: list of int
        @return:    the transcripts (not necessarily in the order of the ids)
        """
        if not ids:
            return []
        transcripts      = self.retrieveBulkFromId(ids)
        idsToTranscripts = {}
        for values in transcripts:
            transcript = Transcript()
            transcript.setSqlValues(values)
            idsToTranscripts[values[0]] = transcript
        return idsToTranscripts.values()


    def selectTranscripts(self, command, simple = False):
        """
        Iterate on the transcripts given by a SELECT command
        @param command: the SQL command
        @type  command: string
        @param simple:  if set, the command is executed once, as it is; otherwise,
                        it is executed by pages (LIMIT/OFFSET) until a page is empty
        @type  simple:  boolean
        @return:        iterator to (row id, transcript) pairs
        """
        MAXSIZE = 100000
        found   = True
        cpt     = 0
        while found:
            found = False
            if simple:
                thisCommand = command
            else:
                thisCommand = "%s LIMIT %d OFFSET %d" % (command, MAXSIZE, MAXSIZE * cpt)
            query = self.mySqlConnection.executeQuery(thisCommand)
            for line in query.getIterator():
                found      = True
                id         = int(line[0])
                transcript = Transcript()
                transcript.setSqlValues(line)
                yield (id, transcript)
            cpt += 1
            if simple:
                return


    def getIterator(self):
        """
        Iterate on the transcripts of the table
        @return: iterator to the transcripts
        """
        for id, transcript in self.selectTranscripts("SELECT * FROM '%s'" % (self.name)):
            yield transcript


    def retrieveTranscriptList(self):
        """
        Read the whole table
        @return: a transcript list holding every transcript of the table
        """
        transcriptList = TranscriptList()
        # the table has no "getLines" method; getIterator already gives the
        # rows as transcripts
        for transcript in self.getIterator():
            transcriptList.addTranscript(transcript)
        return transcriptList


    def setDefaultTagValue(self, name, value):
        """
        Add a tag value to the transcripts which do not have this tag yet
        @param name:  name of the tag
        @type  name:  string
        @param value: value of the tag
        @type  value: string or int
        """
        # "+1": in a row, the columns of the transcript come after the "id" column
        super(MySqlTranscriptTable, self).setDefaultTagValue(Transcript.getSqlVariables().index("tags")+1, name, value)
handle.close() + os.remove(self._outputFileName) + return data + + def _getPid(self): + self._pid = None + cpt = 1 + while True: + commandsFound = [] + for line in self._runShellCommand("ps -o pid,cmd"): + if line[1:] == self._cmd.split(" "): + self._pid = int(line[0]) + commandsFound.append(" ".join(line[1:])) + if self._pid != None: + return True + time.sleep(1) + if cpt % 100 == 0: + print "pid of '%s' not found after %d seconds. Found: %s" % (self._cmd, cpt, " --- ".join(commandsFound)) + cpt += 1 + if cpt > 300: + return False + + def _fetchMemory(self): + lines = self._runShellCommand("ps u -p %d" % (self._pid)) + for line in lines: + self._mem = max(self._mem, float(line[3])) + return self._mem >= self._memory + #print "Cannot find the memory of the current PID (%d) in: %s" % (self._pid, " --- ".join([" ".join(line) for line in lines])) + return False + + def getMemory(self): + return self._mem + + def _abort(self): + try: + self._p.terminate() + except Exception: + pass + self._killSubThreads() + + def _killSubThreads(self): + for line in self._runShellCommand("ps --ppid %d -o pid" % (self._pid)): + self._runShellCommand("kill %s" % (line[0])) + self._runShellCommand("kill %s" % (self._pid)) + + def go(self): + startTime = time.time() + self.run() + #self.start() + while not self._getPid(): + #self.start() + self.run() + while True: + if self._time != None and time.time() - startTime > self._time: + print "\nCommand '%s' did not finish in time. Aborting it." % (self._cmd) + self._abort() + break + if self._memory != None and self._fetchMemory(): + print "\nCommand '%s' required too much memory (%f). Aborting it." 
% (self._cmd, self._mem) + self._abort() + break + #self.join(0.1) + time.sleep(0.1) + #if not self.isAlive(): + if self._p.poll() != None: + return True + return False + + +class DataStructure(object): + def __init__(self): + self._structure = {} + + def addData(self, data): + if data._nbRefs not in self._structure: + self._structure[data._nbRefs] = {} + if data._nbQueries not in self._structure[data._nbRefs]: + self._structure[data._nbRefs][data._nbQueries] = {} + if data._genomeSize not in self._structure[data._nbRefs][data._nbQueries]: + self._structure[data._nbRefs][data._nbQueries][data._genomeSize] = {} + if data._type not in self._structure[data._nbRefs][data._nbQueries][data._genomeSize]: + self._structure[data._nbRefs][data._nbQueries][data._genomeSize][data._type] = [] + self._structure[data._nbRefs][data._nbQueries][data._genomeSize][data._type].append(data._group) + + def export(self): + outputString = "#refs\t#queries\tgenome size\ttype\t# written\t# overlaps\tbuild t.\trun t.\tmem\n" + for nbRefs in sorted(self._structure.keys()): + for nbQueries in sorted(self._structure[nbRefs].keys()): + for genomeSize in sorted(self._structure[nbRefs][nbQueries].keys()): + for type in TYPES: + if type not in self._structure[nbRefs][nbQueries][genomeSize]: + outputString += "NA\tNA\tNA\t%s\tNA\tNA\tNA\tNA\tNA\tNA\n" % (type) + else: + for group in self._structure[nbRefs][nbQueries][genomeSize][type]: + outputString += "%d\t%d\t%d\t%s\t%d\t%d\t%f\t%f\t%f\n" % (nbRefs, nbQueries, genomeSize, type, group._nbOutputs, group._nbOverlaps, group._buildTime, group._runTime, group._mem) + return outputString + + +class Data(object): + def __init__(self, type, buildTime, runTime, mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize): + self._type = type + self._nbRefs = nbRefs + self._nbQueries = nbQueries + self._genomeSize = genomeSize + self._group = Group(nbOutputs, nbOverlaps, buildTime, runTime, mem) + + def checkConsistency(self, data): + return 
self._group.checkConsistency(data._group) + + +class Group(object): + def __init__(self, nbOutputs, nbOverlaps, buildTime, runTime, mem): + self._buildTime = buildTime + self._runTime = runTime + self._mem = mem + self._nbOutputs = nbOutputs + self._nbOverlaps = nbOverlaps + + def checkConsistency(self, group): + if (self._buildTime == "NA" or group._buildTime == "NA"): + return True + return (self._nbOutputs == group._nbOutputs and self._nbOverlaps == group._nbOverlaps) + + +class Benchmark(object): + + def __init__(self, verbosity = 1): + self._verbosity = verbosity + self._checkEnvironmentVariable() + self._toolName = {"bin": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsBin.py"), \ + "has": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsHashBin.py"), \ + "seg": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervalsBinSegment.py"), \ + "fj": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsFJoin.py"), \ + "nc": os.path.join(os.environ["SMARTPATH"], "ncList", "FindOverlapsWithSeveralIntervals.py"), \ + "new": os.path.join(os.environ["SMARTPATH"], "FindOverlapsOptim.py")} + self._structure = DataStructure() + self._pid = os.getpid() + self._count = 0 + self._time = None + self._memory = None + + def _checkEnvironmentVariable(self): + if "SMARTPATH" not in os.environ: + raise Exception("'SMARTPATH' is not set. 
Please set it to '/S-mart/Java/Python'.") + + def _createTmpFileName(self, name, extension): + self._count += 1 + return "tmp_%d_%d_%s.%s" % (self._pid, self._count, name, extension) + + def _dumpAndReturn(self, fileName, exception): + handle = open(fileName) + print "Error in parsing file '%s':" % (fileName) + for line in handle: + print line.strip() + print "Command is: '%s'" % (self._command) + raise exception + + def setNbReferences(self, nbReferences): + self._nbReferences = nbReferences + + def setNbQueries(self, nbQueries): + self._nbQueries = nbQueries + + def setGenomeSizes(self, nbGenomeSizes): + self._nbGenomeSizes = nbGenomeSizes + + def setNbReplicates(self, nbReplicates): + self._nbReplicates = nbReplicates + + def setChromosomeName(self, chromosome): + self._chromosomeName = chromosome + + def setSizes(self, minSize, maxSize): + self._minSize = minSize + self._maxSize = maxSize + + def setOutputFileName(self, fileName): + self._outputFileName = fileName + + def setLimits(self, time, memory): + self._time = time + self._memory = memory + + def _generateIntervals(self, nbElements, genomeSize): + fileName = self._createTmpFileName("intervals", "gff3") + iRR = RandomRegionsGenerator(0) + iRR.setMinSize(self._minSize) + iRR.setMaxSize(self._maxSize) + iRR.setGenomeSize(genomeSize) + iRR.setChromosomeName(self._chromosomeName) + iRR.setStrands(False) + iRR.setNumber(nbElements) + iRR.setOutputFile(fileName) + iRR.run() + return fileName + + def _startTool(self, type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize): + outputFileName = self._createTmpFileName("output", "gff3") + outFileName = self._createTmpFileName("out", "out") + errFileName = self._createTmpFileName("err", "err") + outHandle = open(outFileName, "w") + errHandle = open(errFileName, "w") + self._command = "python %s -i %s -f gff3 -j %s -g gff3 -o %s -v 3" % (self._toolName[type], queryFileName, refFileName, outputFileName) + thread = RunCmd(self._command, outHandle, 
errHandle, self._time, self._memory) + over = thread.go() + self._mem = thread.getMemory() + if os.path.exists(outputFileName): + os.remove(outputFileName) + outHandle.close() + errHandle.close() + errData = open(errFileName).readlines() + if errData: + print "Error output: \n%s\n" % ("\n".join(errData)) + if not over: + errHandle = open(errFileName, "r") + error = errHandle.readlines() + errHandle.close() + if error: + for line in error: + print line.strip() + print "Previous process failed" + os.remove(errFileName) + if not over: + return False + return outFileName + + def _parseTrace(self, type, fileName, genomeSize): + handle = open(fileName) + buildTime = 0 + try: + for line in handle: + line = line.strip() + if "time spent" in line: + buildTime += float(line.split()[-1][:-1]) + elif "done" in line: + buildTime += float(line.split("(")[1][:-2]) + elif "# queries" in line: + nbQueries = int(line.split()[-1]) + elif "# refs" in line: + nbRefs = int(line.split()[-1]) + elif "# written" in line: + nbOutputs = int(line.split()[2]) + nbOverlaps = int(line.split()[3][1:]) + elif "time" in line: + runTime = float(line.split()[-1][:-1]) + except Exception, e: + handle.close() + self._dumpAndReturn(fileName, e) + handle.close() + try: + return Data(type, buildTime, runTime, self._mem, nbRefs, nbQueries, nbOutputs, nbOverlaps, genomeSize) + except Exception, e: + handle.close() + self._dumpAndReturn(fileName, e) + + def _cleanTmpFiles(self, really = False): + files = glob.glob("tmp_%d_*.pkl" % (self._pid)) + glob.glob("tmp_%d_*.bin" % (self._pid)) + if really: + files += glob.glob("tmp_%d_*.gff3" % (self._pid)) + glob.glob("tmp_%d_*.out" % (self._pid)) + for fileName in files: + os.remove(fileName) + + def run(self): + progress = Progress(len(self._nbReferences) * len(self._nbQueries) * len(self._nbGenomeSizes) * self._nbReplicates, "Processing", self._verbosity) + for nbReferences in self._nbReferences: + for queriesRatio in self._nbQueries: + nbQueries = 
int(nbReferences * queriesRatio) + for genomeSizeRatio in self._nbGenomeSizes: + genomeSize = int(nbReferences * genomeSizeRatio) + for replicate in range(self._nbReplicates): + refFileName = self._generateIntervals(nbReferences, genomeSize) + queryFileName = self._generateIntervals(nbQueries, genomeSize) + data = {} + for type in TYPES: + fileName = self._startTool(type, refFileName, queryFileName, nbReferences, nbQueries, genomeSize) + if not fileName: + data[type] = Data(type, "NA", "NA", "NA", nbReferences, nbQueries, "NA", "NA", genomeSize) + else: + data[type] = self._parseTrace(type, fileName, genomeSize) + self._structure.addData(data[type]) + os.remove(fileName) + self._cleanTmpFiles() + self._cleanTmpFiles(True) + firstType = TYPES[0] + for type in TYPES[1:]: + if not data[firstType].checkConsistency(data[type]): + raise Exception("Outputs are not consistent.\n # outputs: %d vs %d.\n # overlaps: %d vs %d.\n %s: %f + %f; %s: %f + %f.\n Files are %s and %s." % (data[firstType]._group._nbOutputs, data[type]._group._nbOutputs, data[firstType]._group._nbOverlaps, data[type]._group._nbOverlaps, firstType, data[firstType]._group._buildTime, data[firstType]._group._runTime, data[firstType]._group._mem, type, data[type]._group._buildTime, data[type]._group._runTime, data[type]._group._mem, refFileName, queryFileName)) + for fileName in (queryFileName, refFileName): + if os.path.exists(fileName): + os.remove(fileName) + progress.inc() + progress.done() + handle = open(self._outputFileName, "w") + handle.write(self._structure.export()) + handle.close() + + + +if __name__ == "__main__": + + description = "Benchmark v1.0.2: Compare NC-List with other tools. Only work under Linux. 
[Category: Other]" + parser = OptionParser(description = description) + parser.add_option("-r", "--nbReferences", dest="nbReferences", action="store", default=None, type="string", help="number of references (list of integers separated by commas) [compulsory] [format: string]") + parser.add_option("-q", "--nbQueries", dest="nbQueries", action="store", default=None, type="string", help="number of queries as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]") + parser.add_option("-R", "--nbReplicates", dest="nbReplicates", action="store", default=None, type="int", help="number of replicates [compulsory] [format: int]") + parser.add_option("-s", "--genomeSizes", dest="genomeSizes", action="store", default=None, type="string", help="genome size as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default="chr1", type="string", help="name of the chromosome [default: chr1] [format: string]") + parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the reads [compulsory] [format: int]") + parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the reads [compulsory] [format: int]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in TXT format]") + parser.add_option("-t", "--time", dest="time", action="store", default=None, type="int", help="maximum time to wait (in seconds) [default: None] [format: int]") + parser.add_option("-m", "--memory", dest="memory", action="store", default=None, type="float", help="maximum memory usage (in %) [default: None] [format: float]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace 
level [format: int]") + (options, args) = parser.parse_args() + + benchmark = Benchmark(options.verbosity) + benchmark.setNbReferences(map(int, options.nbReferences.split(","))) + benchmark.setNbQueries(map(float, options.nbQueries.split(","))) + benchmark.setGenomeSizes(map(float, options.genomeSizes.split(","))) + benchmark.setNbReplicates(options.nbReplicates) + benchmark.setChromosomeName(options.chromosome) + benchmark.setSizes(options.minSize, options.maxSize) + benchmark.setLimits(options.time, options.memory) + benchmark.setOutputFileName(options.outputFileName) + benchmark.run() + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/ConvertToNCList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/ConvertToNCList.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,172 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# + +import random, os, time, shutil +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.ncList.NCList import NCList +from SMART.Java.Python.ncList.NCListCursor import NCListCursor +from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle +from SMART.Java.Python.ncList.FileSorter import FileSorter +from SMART.Java.Python.ncList.NCListMerger import NCListMerger +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress +try: + import cPickle as pickle +except: + import pickle + +class ConvertToNCList(object): + + def __init__(self, verbosity = 1): + self._parsers = {} + self._sortedFileNames = {} + self._inputFileName = None + self._outputFileName = None + self._index = False + self._ncLists = {} + self._splittedFileNames = {} + self._nbElements = 0 + self._nbElementsPerChromosome = {} + self._randomNumber = random.randint(0, 10000) + self._sorted = False + self._verbosity = verbosity + + def setInputFileName(self, fileName, format): + self._inputFileName = fileName + chooser = ParserChooser(self._verbosity) + chooser.findFormat(format) + self._parser = chooser.getParser(fileName) + + def setOutputFileName(self, fileName): + self._outputFileName = fileName + fileNameNoExtension = os.path.splitext(fileName)[0] + baseName = "%s_%d" % (fileNameNoExtension, 
self._randomNumber) + self._directory = "%s_files" % (baseName) + if not os.path.exists(self._directory): + os.makedirs(self._directory) + self._sortedFileNames = os.path.join(self._directory, baseName) + + def setIndex(self, boolean): + self._index = boolean + + def setSorted(self, boolean): + self._sorted = boolean + + def sortFile(self): + if self._verbosity > 2: + print "%s file %s..." % ("Rewriting" if self._sorted else "Sorting", self._inputFileName) + startTime = time.time() + fs = FileSorter(self._parser, self._verbosity-4) + fs.setPresorted(self._sorted) + fs.perChromosome(True) + fs.setOutputFileName(self._sortedFileNames) + fs.sort() + self._splittedFileNames = fs.getOutputFileNames() + self._nbElementsPerChromosome = fs.getNbElementsPerChromosome() + self._nbElements = fs.getNbElements() + endTime = time.time() + if self._verbosity > 2: + print " ...done (%ds)" % (endTime - startTime) + + def createNCLists(self): + self._ncLists = {} + if self._verbosity > 2: + print "Creating NC-list for %s..." 
% (self._inputFileName) + startTime = time.time() + for chromosome, fileName in self._splittedFileNames.iteritems(): + if self._verbosity > 3: + print " chromosome %s" % (chromosome) + ncList = NCList(self._verbosity) + if self._index: + ncList.createIndex(True) + ncList.setChromosome(chromosome) + ncList.setFileName(fileName) + ncList.setNbElements(self._nbElementsPerChromosome[chromosome]) + ncList.buildLists() + self._ncLists[chromosome] = ncList + endTime = time.time() + if self._verbosity > 2: + print " ...done (%ds)" % (endTime - startTime) + + def writeOutputFile(self): + merger = NCListMerger(self._verbosity) + merger.setFileName(self._outputFileName) + merger.addIndex(self._index) + merger.setNCLists(self._ncLists) + merger.merge() + + def cleanFiles(self): + shutil.rmtree(self._directory) + + def run(self): + self.sortFile() + self.createNCLists() + self.writeOutputFile() + self.cleanFiles() + + def getSortedFileNames(self): + return self._splittedFileNames + + def getNbElements(self): + return self._nbElements + + def getNbElementsPerChromosome(self): + return self._nbElementsPerChromosome + + def getNCLists(self): + return self._ncLists + + def getTmpDirectory(self): + return self._directory + + +if __name__ == "__main__": + description = "Convert To NC-List v1.0.0: Convert a mapping or transcript file into a NC-List. 
[Category: NC-List]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="Query input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") + parser.add_option("-d", "--index", dest="index", action="store_true", default=False, help="create an index [default: false] [format: boolean]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in NCList format]") + parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input file is already sorted [format: boolean] [default: False]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]") + (options, args) = parser.parse_args() + + ctncl = ConvertToNCList(options.verbosity) + ctncl.setInputFileName(options.inputFileName, options.format) + ctncl.setOutputFileName(options.outputFileName) + ctncl.setIndex(options.index) + ctncl.setSorted(options.sorted) + ctncl.run() + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/FileSorter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/FileSorter.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,210 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# + +try: + import cPickle as pickle +except: + import pickle +import random, os +from heapq import heapify, heappop, heappush +from itertools import islice, cycle +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress + +BUFFER_SIZE = 100 * 1024 + +class FileSorter(object): + + def __init__(self, parser, verbosity = 1): + self._parser = parser + self._verbosity = verbosity + self._chunks = {} + self._nbElements = 0 + self._nbElementsPerChromosome = {} + self._perChromosome = False + self._isPreSorted = False + self._outputFileNames = {} + self._prefix = "tmpFile_%d" % (random.randint(0, 100000)) + self._chromosome = None + if "SMARTTMPPATH" in os.environ: + self._prefix = os.path.join(os.environ["SMARTTMPPATH"], self._prefix) + + def selectChromosome(self, chromosome): + self._chromosome = chromosome + + def perChromosome(self, boolean): + self._perChromosome = boolean + + def setOutputFileName(self, fileName): + self._outputFileName = fileName + if self._perChromosome: + self._outputFileName = os.path.splitext(self._outputFileName)[0] + + def setPresorted(self, presorted): + self._isPreSorted = presorted + + def sort(self): + if not self._isPreSorted: + self._batchSort() + else: + self._presorted() + + def _presorted(self): + progress = UnlimitedProgress(1000, "Writing files %s" % (self._parser.fileName), self._verbosity) + curChromosome = None + outputHandle = None + + if not self._perChromosome: + outputHandle = open(self._outputFileName, "wb") + for transcript in self._parser.getIterator(): + progress.inc() + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + chromosome = transcript.getChromosome() + if self._chromosome != None and chromosome != self._chromosome: + continue + self._nbElements += 1 + self._nbElementsPerChromosome[chromosome] = self._nbElementsPerChromosome.get(chromosome, 
0) + 1 + if self._perChromosome: + if chromosome != curChromosome: + if outputHandle != None: + outputHandle.close() + self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome) + outputHandle = open(self._outputFileNames[chromosome], "wb") + curChromosome = chromosome + outputHandle.writelines("%s" % pickle.dumps(transcript)) + if outputHandle != None: + outputHandle.close() + progress.done() + + def getNbElements(self): + return self._nbElements + + def getNbElementsPerChromosome(self): + return self._nbElementsPerChromosome + + def _printSorted(self, chromosome, chunk): + chunk.sort(key = lambda transcript: (transcript.getStart(), -transcript.getEnd())) + outputChunk = open("%s_%s_%06i.tmp" % (self._prefix, chromosome, len(self._chunks[chromosome])), "wb", 32000) + self._chunks[chromosome].append(outputChunk) + for transcript in chunk: + outputChunk.write(pickle.dumps(transcript, -1)) + outputChunk.close() + + def _merge(self, chunks): + values = [] + for chunk in chunks: + chunk = open(chunk.name, "rb") + try: + transcript = pickle.load(chunk) + start = transcript.getStart() + end = -transcript.getEnd() + except EOFError: + try: + chunk.close() + chunks.remove(chunk) + os.remove(chunk.name) + except: + pass + else: + heappush(values, (start, end, transcript, chunk)) + while values: + start, end, transcript, chunk = heappop(values) + yield transcript + try: + transcript = pickle.load(chunk) + start = transcript.getStart() + end = -transcript.getEnd() + except EOFError: + try: + chunk.close() + chunks.remove(chunk) + os.remove(chunk.name) + except: + pass + else: + heappush(values, (start, end, transcript, chunk)) + + def _batchSort(self): + currentChunks = {} + counts = {} + try: + progress = UnlimitedProgress(1000, "Sorting file %s" % (self._parser.fileName), self._verbosity) + for transcript in self._parser.getIterator(): + progress.inc() + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + 
chromosome = transcript.getChromosome() + if self._chromosome != None and chromosome != self._chromosome: + continue + if chromosome not in self._chunks: + self._chunks[chromosome] = [] + currentChunks[chromosome] = [] + counts[chromosome] = 0 + currentChunks[chromosome].append(transcript) + counts[chromosome] += 1 + if counts[chromosome] == BUFFER_SIZE: + self._printSorted(chromosome, currentChunks[chromosome]) + currentChunks[chromosome] = [] + counts[chromosome] = 0 + self._nbElements += 1 + self._nbElementsPerChromosome[chromosome] = self._nbElementsPerChromosome.get(chromosome, 0) + 1 + for chromosome in self._chunks: + if counts[chromosome] > 0: + self._printSorted(chromosome, currentChunks[chromosome]) + progress.done() + if not self._perChromosome: + outputHandle = open(self._outputFileName, "wb") + progress = Progress(len(self._chunks), "Writing sorted file %s" % (self._parser.fileName), self._verbosity) + for chromosome in self._chunks: + if self._perChromosome: + self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome) + outputHandle = open(self._outputFileNames[chromosome], "wb") + for sequence in self._merge(self._chunks[chromosome]): + pickle.dump(sequence, outputHandle, -1) + if self._perChromosome: + outputHandle.close() + progress.inc() + if not self._perChromosome: + outputHandle.close() + progress.done() + finally: + for chunks in self._chunks.values(): + for chunk in chunks: + try: + chunk.close() + os.remove(chunk.name) + except Exception: + pass + + def getOutputFileNames(self): + return self._outputFileNames diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,197 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+
+import struct
+import math
+import os
+import time
+from optparse import OptionParser
+from commons.core.writer.Gff3Writer import Gff3Writer
+from SMART.Java.Python.ncList.NCList import NCList
+from SMART.Java.Python.ncList.FileSorter import FileSorter
+from commons.core.parsing.ParserChooser import ParserChooser
+from SMART.Java.Python.ncList.NCListCursor import NCListCursor
+from SMART.Java.Python.structure.Transcript import Transcript
+
+LONGSIZE = struct.calcsize('l')
+
+class FindOverlapsWithOneInterval(object):
+    """Find the reference elements that overlap one query interval.
+
+    The reference file is sorted (FileSorter), loaded into an NC-list
+    (NCList) and searched with an NCListCursor.  When at least one element
+    overlaps, the query is written to a GFF3 file with the tags
+    "nbOverlaps" and "overlapsWith".
+    """
+
+    def __init__(self, verbosity):
+        """Store the trace level and reset the counters."""
+        self._sortedFileName = None
+        self._verbosity = verbosity
+        self._overlappingNames = []
+        self._nbOverlaps = 0
+        self._nbWritten = 0
+        # Defined up front so that close() and dumpWriter() can be called
+        # before an output file or a query has been set.
+        self._iWriter = None
+        self._transcript = None
+
+    def __del__(self):
+        """Remove the sorted intermediate file, if it exists."""
+        if self._sortedFileName and os.path.exists(self._sortedFileName):
+            os.remove(self._sortedFileName)
+
+    def close(self):
+        """Close the output writer, if one was opened."""
+        if self._iWriter is not None:
+            self._iWriter.close()
+
+    def setOutputFileName(self, fileName):
+        """Open a Gff3Writer on fileName."""
+        self._iWriter = Gff3Writer(fileName)
+
+    def setFileName(self, fileName, format):
+        """Choose the parser of the reference file and name the sorted file.
+
+        The sorted file is the input name without its extension, followed by
+        "_sorted.pkl".
+        """
+        chooser = ParserChooser(self._verbosity)
+        chooser.findFormat(format)
+        self._parser = chooser.getParser(fileName)
+        self._sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
+
+    def setInterval(self, chromosome, start, end):
+        """Set the query from coordinates; the query is put on the "+" strand."""
+        self._chromosome = chromosome
+        self._start = start
+        self._end = end
+        self._transcript = Transcript()
+        self._transcript.setChromosome(chromosome)
+        self._transcript.setStart(start)
+        self._transcript.setEnd(end)
+        self._transcript.setDirection("+")
+
+    def setTranscript(self, transcript):
+        """Set the query from a transcript.
+
+        An object whose class is named "Mapping" is first converted with its
+        getTranscript() method.
+        """
+        if transcript.__class__.__name__ == "Mapping":
+            transcript = transcript.getTranscript()
+        self._chromosome = transcript.getChromosome()
+        self._start = transcript.getStart()
+        self._end = transcript.getEnd()
+        self._transcript = transcript
+
+    def prepareIntermediateFiles(self):
+        """Sort the reference file and record the numbers of elements."""
+        fs = FileSorter(self._parser, self._verbosity-4)
+        fs.selectChromosome(self._chromosome)
+        fs.perChromosome(False)
+        fs.setOutputFileName(self._sortedFileName)
+        fs.sort()
+        self._nbTotalLines = fs.getNbElements()
+        self._nbLines = fs.getNbElementsPerChromosome()[self._chromosome]
+
+    def createNCList(self):
+        """Build the NC-list of the sorted file and register it with setNCList()."""
+        if self._verbosity > 2:
+            print "Creating NC-list..."
+        # The timer was missing: the trace below used undefined names and
+        # raised NameError as soon as the verbosity exceeded 2.
+        startTime = time.time()
+        ncList = NCList(self._verbosity)
+        ncList.createIndex(True)
+        ncList.setChromosome(self._chromosome)
+        ncList.setFileName(self._sortedFileName)
+        ncList.setNbElements(self._nbTotalLines)
+        ncList.buildLists()
+        self.setNCList(ncList, ncList.getIndex())
+        endTime = time.time()
+        if self._verbosity > 2:
+            print " ...done (%ds)" % (endTime - startTime)
+
+    def setNCList(self, ncList, index):
+        """Use an NC-list, and its index, built by the caller."""
+        self._ncList = ncList
+        # NOTE(review): the attribute is spelled "_indix" and is not read
+        # anywhere in this class -- confirm before renaming it.
+        self._indix = index
+
+    def binarySearch(self, cursor, startL, endL):
+        """Move cursor to the first overlapping sibling between startL and endL.
+
+        Returns the cursor, or None when no sibling of the range overlaps the
+        query.  The bisection presumably relies on the siblings being sorted
+        along the chromosome -- verify against NCList.
+        """
+        if startL > endL:
+            return None
+        # Floor division: same value as the former "/" on Python 2 integers.
+        middleL = (startL + endL) // 2
+        cursor.moveSibling(middleL)
+        overlap = self.isOverlapping(cursor)
+        if overlap == 0:
+            if middleL == startL:
+                return cursor
+            else:
+                # Keep looking on the left for an earlier overlapping sibling.
+                return self.binarySearch(cursor, startL, middleL)
+        if overlap == -1:
+            return self.binarySearch(cursor, middleL + 1, endL)
+        return self.binarySearch(cursor, startL, middleL - 1)
+
+    def compare(self, cursor = None):
+        """Record every element that overlaps the query.
+
+        Without argument, the search starts from a new cursor on the
+        NC-list.  The cursor is placed by binarySearch(), then moved right
+        while the elements overlap; the method calls itself on the children
+        of every recorded element.
+        """
+        self._ncList.openFiles()
+        if cursor is None:
+            cursor = NCListCursor(None, self._ncList, 0, self._verbosity)
+        # NOTE(review): the indentation of this method was lost in the patch;
+        # the sibling search is assumed to run at every level, not only for
+        # the top-level call -- confirm against the original file.
+        cursor._getSiblingData()
+        cursor = self.binarySearch(cursor, cursor._firstSiblingLIndex, cursor._lastSiblingLIndex)
+        if cursor is None:
+            return
+        while not cursor.isOut() and self.isOverlapping(cursor) == 0:
+            self.write(cursor)
+            newCursor = NCListCursor(cursor)
+            if newCursor.hasChildren():
+                newCursor.moveDown()
+                self.compare(newCursor)
+            if cursor.isLast():
+                return
+            cursor.moveRight()
+
+    def isOverlapping(self, cursor):
+        """Compare the query with the element under the cursor.
+
+        Returns 0 when they overlap, 1 when the query ends before the element
+        starts and -1 when the query starts after the element ends.
+        """
+        if self._end < cursor.getStart():
+            return 1
+        if self._start > cursor.getEnd():
+            return -1
+        return 0
+
+    def write(self, cursor):
+        """Count one overlap and remember the name of the element under the cursor."""
+        self._nbOverlaps += 1
+        refTranscript = cursor.getTranscript()
+        self._overlappingNames.append(refTranscript.getName())
+
+    def dumpWriter(self):
+        """Write the query with its overlap tags, then forget the recorded names.
+
+        Nothing is written when no overlap was recorded or no query is set.
+        """
+        if (not self._overlappingNames) or self._transcript is None:
+            return
+        self._transcript.setTagValue("nbOverlaps", len(self._overlappingNames))
+        self._transcript.setTagValue("overlapsWith", "--".join(self._overlappingNames))
+        self._iWriter.addTranscript(self._transcript)
+        self._nbWritten += 1
+        self._overlappingNames = []
+
+    def run(self):
+        """Sort, index, compare, write the result and print the statistics."""
+        self.prepareIntermediateFiles()
+        self.createNCList()
+        self.compare()
+        self.dumpWriter()
+        self.close()
+        if self._verbosity > 0:
+            print "# refs: %d" % (self._nbLines)
+            # _nbWritten is the counter kept by dumpWriter(); the attribute
+            # formerly printed here was never defined.
+            print "# written: %d (%d overlaps)" % (self._nbWritten, self._nbOverlaps)
+
+
+if __name__ == "__main__":
+    description = "FindOverlapsWithOneInterval: Finds overlaps with one query interval."
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="Input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="Format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-s", "--start", dest="start", action="store", type="int", help="The start of the query interval [compulsory] [format: int]")
+    parser.add_option("-e", "--end", dest="end", action="store", type="int", help="The end of the query interval [compulsory] [format: int]")
+    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", type="string", help="Chromosome of the query interval [compulsory] [format: string]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
+    (options, args) = parser.parse_args()
+
+    iFOWOI = FindOverlapsWithOneInterval(options.verbosity)
+    iFOWOI.setFileName(options.inputFileName, options.format)
+    iFOWOI.setInterval(options.chromosome, options.start, options.end)
+    iFOWOI.setOutputFileName(options.outputFileName)
+    iFOWOI.run()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,182 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+
+import os, struct, time
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.ncList.NCList import NCList
+from SMART.Java.Python.ncList.NCListCursor import NCListCursor
+from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
+from SMART.Java.Python.ncList.FileSorter import FileSorter
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+from SMART.Java.Python.ncList.NCListCursor import NCListCursor
+from SMART.Java.Python.ncList.FindOverlapsWithOneInterval import FindOverlapsWithOneInterval
+
+REFERENCE = 0
+QUERY = 1
+TYPETOSTRING = {0: "reference", 1: "query"}
+
+class FindOverlapsWithSeveralIntervals(object):
+    """Find, for each query interval, the reference elements that overlap it.
+
+    One NC-list is built per reference chromosome; every query is then
+    searched with a shared FindOverlapsWithOneInterval instance, which also
+    owns the output writer.
+    """
+
+    def __init__(self, verbosity = 1):
+        """Create the empty containers and the single-interval finder."""
+        self._parsers = {}
+        self._outputFileName = "outputOverlaps.gff3"
+        self._iWriter = None
+        self._nbLines = {REFERENCE: 0, QUERY: 0}
+        self._verbosity = verbosity
+        self._ncLists = {}
+        self._sortedRefFileNames = None
+        self._transQueryFileName = None
+        self._cursors = {}
+        self._iFowoi = FindOverlapsWithOneInterval(self._verbosity)
+
+    def __del__(self):
+        """Close the writer and remove the intermediate files.
+
+        NOTE(review): the per-chromosome files returned by
+        FileSorter.getOutputFileNames() are not removed here -- confirm
+        whether something else cleans them up.
+        """
+        try:
+            self.close()
+        finally:
+            # Clean up even if closing fails.  The names stay None until the
+            # matching setter is called, and os.path.exists(None) raises
+            # TypeError on Python 2, hence the truth test first.
+            for fileName in (self._sortedRefFileNames, self._transQueryFileName):
+                if fileName and os.path.exists(fileName):
+                    os.remove(fileName)
+
+    def close(self):
+        """Close the output writer held by the single-interval finder."""
+        self._iFowoi.close()
+
+    def setRefFileName(self, fileName, format):
+        """Register the reference file and name its sorted intermediate file."""
+        self.setFileName(fileName, format, REFERENCE)
+        self._sortedRefFileNames = "%s_ref_sorted.pkl" % (os.path.splitext(fileName)[0])
+
+    def setQueryFileName(self, fileName, format):
+        """Register the query file and name its pickled intermediate file."""
+        self.setFileName(fileName, format, QUERY)
+        self._transQueryFileName = "%s_query_trans.pkl" % (os.path.splitext(fileName)[0])
+
+    def setFileName(self, fileName, format, type):
+        """Store the parser of fileName under type (REFERENCE or QUERY)."""
+        chooser = ParserChooser(self._verbosity)
+        chooser.findFormat(format)
+        self._parsers[type] = chooser.getParser(fileName)
+
+    def setOutputFileName(self, outputFileName):
+        """Open the output file through the single-interval finder."""
+        self._iFowoi.setOutputFileName(outputFileName)
+
+    def _sortRefFile(self):
+        """Sort the reference file per chromosome and record the element counts."""
+        fs = FileSorter(self._parsers[REFERENCE], self._verbosity-4)
+        fs.perChromosome(True)
+        fs.setOutputFileName(self._sortedRefFileNames)
+        fs.sort()
+        self._nbLines[REFERENCE] = fs.getNbElements()
+        self._nbRefLinesPerChromosome = fs.getNbElementsPerChromosome()
+        self._splittedFileNames = fs.getOutputFileNames()
+
+    def _translateQueryFile(self):
+        """Pickle the queries, count them and replace the query parser.
+
+        After this call the QUERY parser reads the pickled file.
+        """
+        pickler = NCListFilePickle(self._transQueryFileName, self._verbosity)
+        progress = UnlimitedProgress(1000, "Translating query data", self._verbosity-4)
+        cpt = 0
+        for queryTranscript in self._parsers[QUERY].getIterator():
+            pickler.addTranscript(queryTranscript)
+            progress.inc()
+            cpt += 1
+        progress.done()
+        self._nbLines[QUERY] = cpt
+        self._parsers[QUERY] = NCListFileUnpickle(self._transQueryFileName, self._verbosity)
+
+    def prepareIntermediateFiles(self):
+        """Create the sorted reference files and the pickled query file."""
+        self._sortRefFile()
+        self._translateQueryFile()
+
+    def createNCLists(self):
+        """Build one NC-list, cursor and index per reference chromosome."""
+        self._ncLists = {}
+        self._indices = {}
+        self._cursors = {}
+        for chromosome, fileName in self._splittedFileNames.iteritems():
+            if self._verbosity > 3:
+                print " chromosome %s" % (chromosome)
+            ncList = NCList(self._verbosity)
+            ncList.createIndex(True)
+            ncList.setChromosome(chromosome)
+            ncList.setFileName(fileName)
+            ncList.setNbElements(self._nbRefLinesPerChromosome[chromosome])
+            ncList.buildLists()
+            self._ncLists[chromosome] = ncList
+            cursor = NCListCursor(None, ncList, 0, self._verbosity)
+            self._cursors[chromosome] = cursor
+            self._indices[chromosome] = ncList.getIndex()
+
+    def compare(self):
+        """Search every query and write those that overlap a reference element.
+
+        Queries on a chromosome without reference element are skipped.  The
+        elapsed time is stored in self._timeSpent.
+        """
+        progress = Progress(self._nbLines[QUERY], "Comparing data", self._verbosity-3)
+        startTime = time.time()
+        for queryTranscript in self._parsers[QUERY].getIterator():
+            # Count the query before the chromosome test, so that skipped
+            # queries still advance the progress bar.
+            progress.inc()
+            chromosome = queryTranscript.getChromosome()
+            if chromosome not in self._ncLists:
+                continue
+            self._iFowoi.setNCList(self._ncLists[chromosome], self._indices[chromosome])
+            self._iFowoi.setTranscript(queryTranscript)
+            self._iFowoi.compare()
+            self._iFowoi.dumpWriter()
+        progress.done()
+        endTime = time.time()
+        self._timeSpent = endTime - startTime
+
+    def run(self):
+        """Prepare the files, build the NC-lists, compare and print the statistics."""
+        startTime = time.time()
+        if self._verbosity > 2:
+            print "Creating NC-list..."
+        self.prepareIntermediateFiles()
+        self.createNCLists()
+        endTime = time.time()
+        if self._verbosity > 2:
+            print " ...done (%.2gs)" % (endTime - startTime)
+        self.compare()
+        self.close()
+        if self._verbosity > 0:
+            print "# queries: %d" % (self._nbLines[QUERY])
+            print "# refs: %d" % (self._nbLines[REFERENCE])
+            print "# written: %d (%d overlaps)" % (self._iFowoi._nbWritten, self._iFowoi._nbOverlaps)
+            print "time: %.2gs" % (self._timeSpent)
+
+
+if __name__ == "__main__":
+    description = "FindOverlaps With Several Intervals v1.0.0: Finds overlaps with several query intervals. [Category: Data comparison]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--query", dest="inputQueryFileName", action="store", type="string", help="Query input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--queryFormat", dest="queryFormat", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--ref", dest="inputRefFileName", action="store", type="string", help="Reference input file [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--refFormat", dest="refFormat", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
+    (options, args) = parser.parse_args()
+
+    iFWSI = FindOverlapsWithSeveralIntervals(options.verbosity)
+    iFWSI.setRefFileName(options.inputRefFileName, options.refFormat)
+    iFWSI.setQueryFileName(options.inputQueryFileName, options.queryFormat)
+    iFWSI.setOutputFileName(options.outputFileName)
+    iFWSI.run()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsBin.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsBin.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,204 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2011
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import random, os, os.path, time, sqlite3
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.structure.Interval import Interval
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.Mapping import Mapping
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+try:
+    import cPickle as pickle
+except ImportError:
+    # Only a missing cPickle should trigger the fall-back.
+    import pickle
+
+MINBIN = 3
+MAXBIN = 7
+
+
+def getBin(start, end):
+    """Return the identifier of the smallest bin that contains [start, end].
+
+    Bins of size 10**i are tried for i from MINBIN to MAXBIN; the identifier
+    is i * 10**(MAXBIN + 1) plus the rank of the bin at that level.  An
+    interval that crosses a bin boundary at every level gets the catch-all
+    identifier (MAXBIN + 1) * 10**(MAXBIN + 1).
+    """
+    for i in range(MINBIN, MAXBIN + 1):
+        binLevel = 10 ** i
+        if int(start / binLevel) == int(end / binLevel):
+            return int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))
+    return int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
+
+def getOverlappingBins(start, end):
+    """Return the ranges of bin identifiers that may hold overlapping elements.
+
+    The result is a list of (first, last) pairs, one per level from MINBIN
+    to MAXBIN, followed by the catch-all bin as a (bigBin, bigBin) pair.
+    """
+    array = []
+    bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
+    for i in range(MINBIN, MAXBIN + 1):
+        binLevel = 10 ** i
+        array.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))
+    array.append((bigBin, bigBin))
+    return array
+
+
+class FindOverlapsWithSeveralIntervalsBin(object):
+    """Compare queries with references stored, binned, in a temporary SQLite file.
+
+    The references are stored in one table per chromosome; each row holds
+    the start, the end, the pickled transcript and the bin of the element.
+    """
+
+    def __init__(self, verbosity):
+        """Create the temporary database and tune it for speed.
+
+        The file is created in the directory given by the SMARTTMPPATH
+        environment variable when it is set, in the working directory
+        otherwise.
+        """
+        self.verbosity = verbosity
+        self.randomNumber = random.randint(0, 10000)
+        self.dbName = "smartdb%d" % (self.randomNumber)
+        if "SMARTTMPPATH" in os.environ:
+            # os.join does not exist; the former call raised AttributeError
+            # whenever SMARTTMPPATH was set.
+            self.dbName = os.path.join(os.environ["SMARTTMPPATH"], self.dbName)
+        self.connection = sqlite3.connect(self.dbName)
+        self.tableNames = {}
+        self.nbQueries = 0
+        self.nbRefs = 0
+        self.nbWritten = 0
+        self.nbOverlaps = 0
+        cursor = self.connection.cursor()
+        cursor.execute("PRAGMA journal_mode = OFF")
+        cursor.execute("PRAGMA synchronous = 0")
+        cursor.execute("PRAGMA locking_mode = EXCLUSIVE")
+        cursor.execute("PRAGMA count_change = OFF")
+        cursor.execute("PRAGMA temp_store = 2")
+
+    def __del__(self):
+        """Drop the tables, close the connection and remove the database file."""
+        cursor = self.connection.cursor()
+        for tableName in self.tableNames.values():
+            cursor.execute("DROP TABLE IF EXISTS %s" % (tableName))
+        # Release the file before deleting it.
+        self.connection.close()
+        if os.path.exists(self.dbName):
+            os.remove(self.dbName)
+
+    def createTable(self, chromosome):
+        """Create the table, and its index on the bin, of one chromosome."""
+        cursor = self.connection.cursor()
+        # NOTE(review): only "-" is replaced in the chromosome name;
+        # presumably the names contain no other character that is invalid in
+        # a table name -- verify.
+        tableName = "tmpTable_%s_%d" % (chromosome.replace("-", "_"), self.randomNumber)
+        cursor.execute("CREATE TABLE %s (start INT, end INT, transcript BLOB, bin INT)" % (tableName))
+        cursor.execute("CREATE INDEX index_%s ON %s (bin)" % (tableName, tableName))
+        self.tableNames[chromosome] = tableName
+
+    def setReferenceFile(self, fileName, format):
+        """Store every element of the reference file into the database."""
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        parser = chooser.getParser(fileName)
+        startTime = time.time()
+        if self.verbosity > 2:
+            print "Storing into table"
+        # One cursor serves all the insertions.
+        cursor = self.connection.cursor()
+        for transcript in parser.getIterator():
+            if transcript.__class__.__name__ == "Mapping":
+                transcript = transcript.getTranscript()
+            transcriptString = pickle.dumps(transcript)
+            chromosome = transcript.getChromosome()
+            if chromosome not in self.tableNames:
+                self.createTable(chromosome)
+            start = transcript.getStart()
+            end = transcript.getEnd()
+            binId = getBin(start, end)
+            cursor.execute("INSERT INTO %s (start, end, transcript, bin) VALUES (?, ?, ?, ?)" % (self.tableNames[chromosome]), (start, end, sqlite3.Binary(transcriptString), binId))
+            self.nbRefs += 1
+        self.connection.commit()
+        endTime = time.time()
+        if self.verbosity > 2:
+            print " ...done (%.2gs)" % (endTime - startTime)
+
+    def setQueryFile(self, fileName, format):
+        """Choose the parser of the query file and read its number of items."""
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.queryParser = chooser.getParser(fileName)
+        self.nbQueries = self.queryParser.getNbItems()
+
+    def setOutputFile(self, fileName):
+        """Open a GFF3 TranscriptWriter on fileName.
+
+        NOTE(review): nothing in this class closes the writer -- confirm that
+        TranscriptWriter flushes its output on its own.
+        """
+        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+
+    def compare(self):
+        """Write every query that overlaps at least one reference element.
+
+        The candidate rows are read from the bins given by
+        getOverlappingBins(), filtered on their coordinates, then unpickled
+        and checked with overlapWith().  The elapsed time is stored in
+        self.timeSpent.
+        """
+        progress = Progress(self.nbQueries, "Reading queries", self.verbosity)
+        startTime = time.time()
+        for queryTranscript in self.queryParser.getIterator():
+            if queryTranscript.__class__.__name__ == "Mapping":
+                queryTranscript = queryTranscript.getTranscript()
+            progress.inc()
+            queryChromosome = queryTranscript.getChromosome()
+            if queryChromosome not in self.tableNames:
+                continue
+            queryStart = queryTranscript.getStart()
+            queryEnd = queryTranscript.getEnd()
+            commands = []
+            for firstBin, lastBin in getOverlappingBins(queryStart, queryEnd):
+                command = "SELECT * FROM %s WHERE bin " % (self.tableNames[queryChromosome])
+                if firstBin == lastBin:
+                    command += "= %d" % (firstBin)
+                else:
+                    command += "BETWEEN %d AND %d" % (firstBin, lastBin)
+                commands.append(command)
+            command = " UNION ".join(commands)
+            cursor = self.connection.cursor()
+            cursor.execute(command)
+            overlap = False
+            for refStart, refEnd, refTranscriptString, refBin in cursor:
+                if refStart <= queryEnd and refEnd >= queryStart:
+                    # The pickled data was written by setReferenceFile().
+                    refTranscript = pickle.loads(str(refTranscriptString))
+                    if refTranscript.overlapWith(queryTranscript):
+                        overlap = True
+                        self.nbOverlaps += 1
+            if overlap:
+                self.writer.addTranscript(queryTranscript)
+                self.nbWritten += 1
+        progress.done()
+        endTime = time.time()
+        self.timeSpent = endTime - startTime
+
+    def displayResults(self):
+        """Print the counters and the time spent in compare()."""
+        print "# queries: %d" % (self.nbQueries)
+        print "# refs: %d" % (self.nbRefs)
+        print "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)
+        print "time: %.2gs" % (self.timeSpent)
+
+    def run(self):
+        """Compare the queries with the references, then print the statistics."""
+        self.compare()
+        self.displayResults()
+
+if __name__ == "__main__":
+
+    description = "Find Overlaps With Several Intervals Using Bin v1.0.1: Use MySQL binning to compare intervals. [Category: Personal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    fowsib = FindOverlapsWithSeveralIntervalsBin(options.verbosity)
+    fowsib.setQueryFile(options.inputFileName1, options.format1)
+    fowsib.setReferenceFile(options.inputFileName2, options.format2)
+    fowsib.setOutputFile(options.outputFileName)
+    fowsib.run()
+
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsIndex.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsIndex.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,137 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2011
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+import random, os, time, MySQLdb
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.writer.TranscriptWriter import TranscriptWriter
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+
+
+class FindOverlapsWithSeveralIntervalsIndex(object):
+    """Compare queries with references stored in an indexed MySQL table.
+
+    NOTE(review): the table only stores the start and the end of the
+    references, so intervals located on different chromosomes are reported
+    as overlapping -- confirm that this is intended.
+    """
+
+    def __init__(self, verbosity):
+        """Connect to the database and choose a random table name."""
+        self.verbosity = verbosity
+        randomNumber = random.randint(0, 10000)
+        self.dbName = "smartdb"
+        if "SMARTTMPPATH" in os.environ:
+            # os.join does not exist; the former call raised AttributeError
+            # whenever SMARTTMPPATH was set.
+            self.dbName = os.path.join(os.environ["SMARTTMPPATH"], self.dbName)
+        self.db = MySQLdb.connect(db = self.dbName)
+        self.tableName = "table_%s" % (randomNumber)
+        self.nbQueries = 0
+        self.nbRefs = 0
+        self.nbOverlaps = 0
+
+    def __del__(self):
+        """Drop the temporary table."""
+        cursor = self.db.cursor()
+        cursor.execute("DROP TABLE IF EXISTS %s" % (self.tableName))
+
+
+    def setReferenceFile(self, fileName, format):
+        """Create the table and store the coordinates of every reference element."""
+        cursor = self.db.cursor()
+        cursor.execute("CREATE TABLE %s (start INT, end INT)" % (self.tableName))
+        cursor.execute("CREATE INDEX index_%s ON %s (start, end)" % (self.tableName, self.tableName))
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        parser = chooser.getParser(fileName)
+        progress = UnlimitedProgress(1000, "Reading references", self.verbosity)
+        # The cursor created above serves all the insertions.
+        for transcript in parser.getIterator():
+            start = transcript.getStart()
+            end = transcript.getEnd()
+            cursor.execute("INSERT INTO %s (start, end) VALUES (%d, %d)" % (self.tableName, start, end))
+            self.nbRefs += 1
+            progress.inc()
+        self.db.commit()
+        progress.done()
+
+    def setQueryFile(self, fileName, format):
+        """Choose the parser of the query file and read its number of transcripts."""
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        self.queryParser = chooser.getParser(fileName)
+        self.nbQueries = self.queryParser.getNbTranscripts()
+
+    def setOutputFile(self, fileName):
+        """Open a GFF3 TranscriptWriter on fileName."""
+        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+
+    def compare(self):
+        """Write every query whose coordinates overlap a stored reference.
+
+        nbOverlaps counts the queries written, not the overlapping pairs.
+        The elapsed time is stored in self.timeSpent.
+        """
+        progress = Progress(self.nbQueries, "Reading queries", self.verbosity)
+        startTime = time.time()
+        for queryTranscript in self.queryParser.getIterator():
+            queryStart = queryTranscript.getStart()
+            queryEnd = queryTranscript.getEnd()
+            command = "SELECT 1 FROM %s WHERE start <= %d and end >= %d" % (self.tableName, queryEnd, queryStart)
+            cursor = self.db.cursor()
+            cursor.execute(command)
+            # Every row is fetched, as before, so that no result is left
+            # pending on the cursor; one row is enough to flag the query.
+            overlap = len(cursor.fetchall()) > 0
+            if overlap:
+                self.writer.addTranscript(queryTranscript)
+                self.nbOverlaps += 1
+            progress.inc()
+        progress.done()
+        endTime = time.time()
+        self.timeSpent = endTime - startTime
+
+    def displayResults(self):
+        """Print the counters and the time spent in compare()."""
+        print "# queries: %d" % (self.nbQueries)
+        print "# refs: %d" % (self.nbRefs)
+        print "# overlaps: %d" % (self.nbOverlaps)
+        print "time: %.2gs" % (self.timeSpent)
+
+    def run(self):
+        """Compare the queries with the references, then print the statistics."""
+        self.compare()
+        self.displayResults()
+
+if __name__ == "__main__":
+
+    description = "Find Overlaps With Several Intervals Using Indices v1.0.1: Use MySQL to compare intervals. [Category: Personal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    fowsii = FindOverlapsWithSeveralIntervalsIndex(options.verbosity)
+    fowsii.setQueryFile(options.inputFileName1, options.format1)
+    fowsii.setReferenceFile(options.inputFileName2, options.format2)
+    fowsii.setOutputFile(options.outputFileName)
+    fowsii.run()
+
+
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/FindOverlaps_naif.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/ncList/FindOverlaps_naif.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,85 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+
+import os
+import struct
+from optparse import OptionParser
+from commons.core.parsing.GffParser import GffParser
+from commons.core.writer.Gff3Writer import Gff3Writer
+
+LONGSIZE = struct.calcsize('l')
+
+class FindOverlaps_naif(object):
+    """Compare every query with every reference element, without any index.
+
+    The reference file is parsed again for each query.
+    """
+
+    def __init__(self, inputRefGff3FileName, inputQueryGff3FileName):
+        """Store the names of the reference and query GFF3 files."""
+        self._inputRefGff3FileName = inputRefGff3FileName
+        self._inputQueryGff3FileName = inputQueryGff3FileName
+        # No writer until setOutputGff3FileName() is called; close() tests it.
+        self._iGff3Writer = None
+
+    def close(self):
+        """Close the output writer, if one was opened."""
+        if self._iGff3Writer is not None:
+            self._iGff3Writer.close()
+
+    def setGff3FileName(self, fileName):
+        """Change the reference file."""
+        self._inputRefGff3FileName = fileName
+
+    def setQueryGff3FileName(self, fileName):
+        """Change the query file."""
+        self._inputQueryGff3FileName = fileName
+
+    def setOutputGff3FileName(self, outputGff3FileName):
+        """Open the output writer; an empty name keeps the name set before.
+
+        NOTE(review): with an empty name and no name set earlier,
+        self._outputGff3FileName does not exist and the last line raises
+        AttributeError -- confirm whether a default name is wanted.
+        """
+        if outputGff3FileName != '':
+            self._outputGff3FileName = outputGff3FileName
+        self._iGff3Writer = Gff3Writer(self._outputGff3FileName)
+
+    def run(self):
+        """Write every query that overlaps a reference element.
+
+        The written queries carry the number of overlapping elements
+        ("nbOverlaps") and their "ID" tags joined with "--" ("overlapsWith").
+        """
+        queryParser = GffParser(self._inputQueryGff3FileName, 0)
+        for queryTranscript in queryParser.getIterator():
+            ids = []
+            refParser = GffParser(self._inputRefGff3FileName, 0)
+            for refTranscript in refParser.getIterator():
+                if queryTranscript.overlapWith(refTranscript):
+                    ids.append(refTranscript.getTagValue('ID'))
+            if ids:
+                queryTranscript.setTagValue("nbOverlaps", len(ids))
+                queryTranscript.setTagValue("overlapsWith", "--".join(ids))
+                self._iGff3Writer.addTranscript(queryTranscript)
+
+if __name__ == "__main__":
+    description = "FindOverlapsWithSeveralInterval: Finds overlaps with several query intervals."
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--inputRef", dest="inputRefGff3FileName", action="store", type="string", help="Reference input file [compulsory] [format: file in gff3 format]")
+    parser.add_option("-j", "--inputQuery", dest="inputQueryGff3FileName", action="store", type="string", help="Query input file [compulsory] [format: file in gff3 format]")
+    parser.add_option("-o", "--output", dest="outputGff3FileName", action="store", type="string", help="output file [compulsory] [format: output file in gff3 format]")
+    (options, args) = parser.parse_args()
+
+    iFON = FindOverlaps_naif(options.inputRefGff3FileName, options.inputQueryGff3FileName)
+    iFON.setOutputGff3FileName(options.outputGff3FileName)
+    try:
+        iFON.run()
+    finally:
+        # Close the output even when the comparison fails.
+        iFON.close()
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/NCIndex.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/ncList/NCIndex.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,55 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#

from SMART.Java.Python.structure.Transcript import Transcript

class NCIndex(object):
    """Coarse position index: one stored value per ``step``-wide bin."""

    def __init__(self, verbosity):
        """Create an empty index with the default bin width of 10000."""
        self._verbosity = verbosity
        self._step = 10000
        self._indices = []

    def setStep(self, step):
        """Change the bin width."""
        self._step = step

    def addTranscript(self, end, index):
        """Register ``index`` for every bin not yet filled, up to the bin of ``end``."""
        firstFreeBin = len(self._indices)
        lastBin = int(end / self._step)
        # A non-positive count extends by nothing, like an empty range.
        self._indices.extend([index] * (lastBin + 1 - firstFreeBin))

    def getIndex(self, transcript):
        """Return the value stored for the bin holding the transcript start.

        Starts lying beyond the last filled bin get the last stored value.
        """
        slot = int(transcript.getStart() / self._step)
        if slot < len(self._indices):
            return self._indices[slot]
        return self._indices[-1]

# ---- patch metadata of the next file in this changeset (kept verbatim) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/NCList.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/ncList/NCList.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,337 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os, os.path
import struct
import shelve
import sys
from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
from SMART.Java.Python.ncList.NCIndex import NCIndex
from SMART.Java.Python.misc.Progress import Progress

# Size in bytes of one native C long; every table cell has this size.
LONG_SIZE = struct.calcsize('l')

# Table identifiers.
H = 0
L = 1
T = 2
G = 3

# Number of cells per row in each table.
H_CELL_SIZE = 2
L_CELL_SIZE = 5
T_CELL_SIZE = 6

# Column numbers inside a row (LENGTH is the second column of table H).
START = 0
END = 1
ADDRESS = 2
LIST = 3
PARENT = 4
NEW = 5
LENGTH = 1

def pack(input):
    """Encode a number as one native C long."""
    return struct.pack("l", long(input))
def unpack(input):
    """Decode one native C long."""
    return struct.unpack("l", input)[0]


class NCList(object):
    """Builds and reads the on-disk tables of a nested containment list.

    Table H holds one (start, length) row per sub-list, table L one row
    per interval (start, end, address, sub-list, parent) and table T is a
    scratch table used while building, removed afterwards.
    """

    def __init__(self, verbosity):
        """Create an empty NC-list; no file is opened yet."""
        self._verbosity = verbosity
        self._subPos = 0
        self._parentPos = 0
        self._nbLines = 0
        self._nbLists = 0
        self._chromosome = None
        self._transcriptFileName = None
        self._lHandle = None
        self._hHandle = None
        self._tHandle = None
        self._parser = None
        self._sizeDict = {H: H_CELL_SIZE, L: L_CELL_SIZE, T: T_CELL_SIZE}
        self._offsets = {H: 0, L: 0, G: 0}
        self._fileNameDict = {}
        self._handleDict = {}
        self._createIndex = False
        # Defined up front so the getters never raise AttributeError.
        self._index = None
        self._sizeFirstList = 0
        # Values used when a row number is -1 ("no row").
        self._missingValues = dict([table, {}] for table in self._sizeDict)
        self._missingValues[T][LIST] = -1
        self._missingValues[L][LIST] = 0
        self._missingValues[T][NEW] = -1

    def __del__(self):
        for handle in (self._lHandle, self._hHandle):
            if handle is not None:
                handle.close()

    def createIndex(self, boolean):
        """Choose whether buildLists() also fills an NCIndex."""
        self._createIndex = boolean

    def setChromosome(self, chromosome):
        self._chromosome = chromosome

    def setFileName(self, fileName):
        """Set the pickled transcript file and derive the table file names."""
        self._transcriptFileName = fileName
        self._parser = NCListFileUnpickle(fileName, self._verbosity)
        self._setFileNames(fileName)

    def setNbElements(self, nbElements):
        self._nbLines = nbElements

    def setOffset(self, fileType, offset):
        """Set the byte offset added to every seek in the given table."""
        self._offsets[fileType] = offset

    def _setFileNames(self, fileName):
        """Derive the ``_H.bin``, ``_L.bin`` and ``_T.bin`` names.

        Nothing is set unless both the chromosome and the file name are known.
        """
        if self._chromosome is not None and fileName is not None:
            coreName = os.path.splitext(fileName)[0]
            if "SMARTTMPPATH" in os.environ:
                coreName = os.path.join(os.environ["SMARTTMPPATH"], coreName)
            self._hFileName = "%s_H.bin" % (coreName)
            self._lFileName = "%s_L.bin" % (coreName)
            self._tFileName = "%s_T.bin" % (coreName)
            self._fileNameDict = {H: self._hFileName, L: self._lFileName, T: self._tFileName}

    def getSizeFirstList(self):
        """Return the number of intervals without parent."""
        return self._sizeFirstList

    def _writeSubListIntoH(self, SubListAddr, SubListLength):
        self._hHandle.write(pack(SubListAddr))
        self._hHandle.write(pack(SubListLength))
        self._subPos += H_CELL_SIZE

    def _writeParentIntoL(self, readAddr, subListAddr, parentAddr, start, end):
        self._lHandle.write(pack(start))
        self._lHandle.write(pack(end))
        self._lHandle.write(pack(readAddr))
        self._lHandle.write(pack(subListAddr))
        self._lHandle.write(pack(parentAddr))
        self._parentPos += L_CELL_SIZE

    def getLLineElements(self, subListLAddr):
        """Read one row of table L.

        Returns (gff3Addr, subListHAddr, parentLAddr, start, end), or five
        -1 when the row number is -1/None or lies past the end of the data.
        """
        if subListLAddr == -1 or subListLAddr is None:
            return -1, -1, -1, -1, -1
        self._lHandle.seek(subListLAddr * L_CELL_SIZE * LONG_SIZE + self._offsets[L])
        start = self._lHandle.read(LONG_SIZE)
        if len(start) < LONG_SIZE:
            return -1, -1, -1, -1, -1
        start = unpack(start)
        end = unpack(self._lHandle.read(LONG_SIZE))
        gff3Addr = unpack(self._lHandle.read(LONG_SIZE))
        subListHAddr = unpack(self._lHandle.read(LONG_SIZE))
        parentLAddr = unpack(self._lHandle.read(LONG_SIZE))
        return gff3Addr, subListHAddr, parentLAddr, start, end

    def getHLineElements(self, subListHAddr):
        """Read one row of table H.

        Returns (first L row of the sub-list, number of elements), or
        (-1, -1) on a short read.
        """
        self._hHandle.seek(subListHAddr * H_CELL_SIZE * LONG_SIZE + self._offsets[H])
        subListStartBin = self._hHandle.read(LONG_SIZE)
        # Compare with LONG_SIZE, not a literal 8: a native long is 4 bytes
        # on some platforms, where every read would look truncated.
        if len(subListStartBin) < LONG_SIZE:
            return -1, -1
        subListStart = unpack(subListStartBin)
        subListElementsNb = unpack(self._hHandle.read(LONG_SIZE))
        return subListStart, subListElementsNb

    def getRefGffAddr(self, currentRefLAddr):
        """Return the transcript address stored in the given L row."""
        RefGff3Addr, subListHAddr, parentLAddr, start, end = self.getLLineElements(currentRefLAddr)
        return RefGff3Addr

    def getIntervalFromAdress(self, address):
        """Return the transcript stored at ``address`` (plus the G offset)."""
        self._parser.gotoAddress(int(address) + self._offsets[G])
        iTranscrit = self._parser.getNextTranscript()
        return iTranscrit

    def removeFiles(self):
        return

    def buildLists(self):
        """Build tables H and L from the transcript file, then drop table T."""
        if self._createIndex:
            self._index = NCIndex(self._verbosity)
        self._createTables()
        self._labelLists()
        self._computeSubStart()
        self._computeAbsPosition()
        self._cleanFiles()

    def _createTables(self):
        self._initLists()
        self._createTable(H, self._nbLists)
        self._createTable(T, self._nbLines)
        self._createTable(L, self._nbLines)
        self._fillTables()

    def _initLists(self):
        """Count the lists: 1 plus each transcript included in its predecessor."""
        previousTranscript = None
        self._nbLists = 1
        progress = Progress(self._nbLines, "Initializing lists", self._verbosity-5)
        for transcript in self._parser.getIterator():
            if self._isIncluded(transcript, previousTranscript):
                self._nbLists += 1
            previousTranscript = transcript
            progress.inc()
        progress.done()

    def _isIncluded(self, transcript1, transcript2):
        """Tell whether transcript1 lies within the bounds of transcript2."""
        return transcript1 is not None and transcript2 is not None and transcript1.getStart() >= transcript2.getStart() and transcript1.getEnd() <= transcript2.getEnd()

    def _createTable(self, name, size):
        """Create the table file with ``size`` rows, every cell set to -1."""
        handle = open(self._fileNameDict[name], "w+b")
        progress = Progress(self._sizeDict[name] * size, "Initializing table %d" % (name), self._verbosity-5)
        for i in xrange(self._sizeDict[name] * size):
            handle.write(pack(-1))
            progress.inc()
        progress.done()
        self._handleDict[name] = handle

    def _fillTables(self):
        """Copy start/end/address of each transcript into T; zero H lengths."""
        progress = Progress(self._nbLines, "Filling table T", self._verbosity-5)
        for i, transcript in enumerate(self._parser.getIterator()):
            self._writeValue(T, i, START, transcript.getStart())
            self._writeValue(T, i, END, transcript.getEnd())
            self._writeValue(T, i, ADDRESS, self._parser.getCurrentTranscriptAddress())
            self._writeValue(T, i, PARENT, -1)
            self._writeValue(T, i, LIST, -1)
            progress.inc()
        progress.done()
        progress = Progress(self._nbLists, "Filling table H", self._verbosity-5)
        for i in xrange(self._nbLists):
            self._writeValue(H, i, LENGTH, 0)
            progress.inc()
        progress.done()

    def _labelLists(self):
        """Find the parent of every row of T and count the size of each list."""
        progress = Progress(self._nbLines, "Getting table structure", self._verbosity-5)
        nextL = 0
        for i in xrange(self._nbLines):
            p = i - 1
            start = self._readValue(T, i, START)
            end = self._readValue(T, i, END)
            # Climb the parent chain until a row containing [start, end].
            while p != -1 and (start < self._readValue(T, p, START) or end > self._readValue(T, p, END)):
                p = self._readValue(T, p, PARENT)
            thisL = self._readValue(T, p, LIST)
            if thisL == -1:
                # First child of p: give it a new list number.
                thisL = nextL
                nextL += 1
                length = 0
                self._writeValue(T, p, LIST, thisL)
            else:
                length = self._readValue(H, thisL, LENGTH)
            self._writeValue(T, i, PARENT, p)
            self._writeValue(H, thisL, LENGTH, length + 1)
            progress.inc()
        progress.done()

    def _computeSubStart(self):
        """Turn the list lengths of H into start rows; reset the lengths to 0."""
        progress = Progress(self._nbLines, "Getting table sub-lists", self._verbosity-5)
        total = 0
        for i in xrange(self._nbLists):
            self._writeValue(H, i, START, total)
            total += self._readValue(H, i, LENGTH)
            self._writeValue(H, i, LENGTH, 0)
            progress.inc()
        progress.done()

    def _computeAbsPosition(self):
        """Write every interval at its final row of table L."""
        progress = Progress(self._nbLines, "Writing table", self._verbosity-5)
        self._sizeFirstList = 0
        for i in xrange(self._nbLines):
            s = self._readValue(T, i, START)
            e = self._readValue(T, i, END)
            a = self._readValue(T, i, ADDRESS)
            pt = self._readValue(T, i, PARENT)
            h = self._readValue(T, pt, LIST)
            pl = self._readValue(T, pt, NEW)
            nb = self._readValue(H, h, LENGTH)
            l = self._readValue(H, h, START) + nb
            self._writeValue(T, i, NEW, l)
            self._writeValue(L, l, START, s)
            self._writeValue(L, l, END, e)
            self._writeValue(L, l, ADDRESS, a)
            self._writeValue(L, l, LIST, -1)
            self._writeValue(L, l, PARENT, pl)
            self._writeValue(H, h, LENGTH, nb+1)
            if nb == 0:
                # First element of list h: link the parent row to it.
                self._writeValue(L, pl, LIST, h)
            if pl == -1:
                self._sizeFirstList += 1
            if self._createIndex:
                self._index.addTranscript(e, l)
            progress.inc()
        progress.done()

    def closeFiles(self):
        """Close every open table file and forget the transcript parser."""
        for handle in self._handleDict.values():
            handle.close()
        # Reset instead of deleting, so a second call stays harmless.
        self._handleDict = {}
        self._lHandle = None
        self._hHandle = None
        self._tHandle = None
        self._parser = None

    def openFiles(self):
        """Reopen tables L and H read-only, plus the transcript parser."""
        self._lHandle = open(self._fileNameDict[L], "rb")
        self._hHandle = open(self._fileNameDict[H], "rb")
        self._handleDict = {H: self._hHandle, L: self._lHandle}
        self._parser = NCListFileUnpickle(self._transcriptFileName, self._verbosity)

    def _cleanFiles(self):
        self.closeFiles()
        os.remove(self._fileNameDict[T])

    def _getPosition(self, table, line, key):
        """Seek to cell (line, key) of the table and return its handle."""
        handle = self._handleDict[table]
        handle.seek(self._sizeDict[table] * line * LONG_SIZE + key * LONG_SIZE)
        return handle

    def _writeValue(self, table, line, key, value):
        """Write one cell; row -1 is kept in memory, not on disk."""
        if line == -1:
            self._missingValues[table][key] = value
            return
        handle = self._getPosition(table, line, key)
        handle.write(pack(value))

    def _readValue(self, table, line, key):
        """Read one cell; row -1 is served from memory."""
        if line == -1:
            return self._missingValues[table][key]
        handle = self._getPosition(table, line, key)
        r = unpack(handle.read(LONG_SIZE))
        return r

    def getIndex(self):
        """Return the NCIndex, or None when none was built or loaded."""
        return self._index

# ---- patch metadata of the next file in this changeset (kept verbatim) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/NCListCursor.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/ncList/NCListCursor.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,325 @@
#!
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os, os.path, struct
from commons.core.parsing.GffParser import GffParser
from SMART.Java.Python.misc.Progress import Progress


class Data(object):
    """Plain record caching what precompute() reads for one first-list row."""
    def __init__(self, hIndex, transcript, firstChildLIndex, lastChildLIndex, start, end):
        self.hIndex = hIndex
        self.transcript = transcript
        self.firstChildLIndex = firstChildLIndex
        self.lastChildLIndex = lastChildLIndex
        self.start = start
        self.end = end

class NCListCursor(object):
    """Position inside an NC-list, with lazily loaded neighbourhood data.

    The cursor is identified by an L-table row (``_lIndex``); -1 means the
    cursor is out of the list.  Most other fields are caches, reset to
    None by the move methods and reloaded on demand from the NC-list.
    """

    def __init__(self, cursor = None, ncList = None, lIndex = 0, verbosity = 0):
        """Copy ``cursor`` when given, else start at ``lIndex`` of ``ncList``."""
        self._verbosity = verbosity
        self._mainListData = []
        if cursor:
            self.copy(cursor)
        else:
            self._ncList = ncList
            self.setLIndex(lIndex)

    def setLIndex(self, lIndex):
        """Move to ``lIndex`` and drop every cached value."""
        self._lIndex = lIndex
        self._start = None
        self._end = None
        self._hIndex = None
        self._gffIndex = None
        self._parentGffIndex = None
        self._parentLIndex = None
        self._parentHIndex = None
        self._parentStart = None
        self._parentEnd = None
        self._transcript = None
        self._firstSiblingLIndex = None
        self._lastSiblingLIndex = None
        self._firstChildLIndex = None
        self._lastChildLIndex = None
        # Only rows of the first list have a position in the main-list cache.
        self._mainListIndex = lIndex if lIndex < self._ncList.getSizeFirstList() else None

    def precompute(self):
        """Load every row of the first list into ``_mainListData``."""
        self._mainListIndex = 0
        progress = Progress(self._ncList.getSizeFirstList(), "Precomputing data", self._verbosity)
        for i in range(self._ncList.getSizeFirstList()):
            gffIndex, hIndex, parentLIndex, start, end = self._ncList.getLLineElements(i)
            transcript = self._ncList.getIntervalFromAdress(gffIndex)
            firstChildLIndex, nbChildren = self._ncList.getHLineElements(hIndex)
            lastChildLIndex = -1 if firstChildLIndex == -1 else firstChildLIndex + nbChildren-1
            self._mainListData.append(Data(hIndex, transcript, firstChildLIndex, lastChildLIndex, start, end))
            progress.inc()
        progress.done()

    def _updateFromMainListData(self):
        """Refresh the caches from ``_mainListData``.

        Returns False when nothing was precomputed or the cursor is not in
        the first list, True otherwise.
        """
        if not self._mainListData or self._lIndex >= self._ncList.getSizeFirstList():
            #print "OUT"
            return False
        # NOTE(review): this branch does not return; the lookup below then
        # uses the same out-of-range index, and the -1 is overwritten when
        # the lookup succeeds. Intended behaviour to be confirmed.
        if self._mainListIndex >= self._ncList.getSizeFirstList():
            self._hIndex = -1
        data = self._mainListData[self._mainListIndex]
        self._hIndex = data.hIndex
        self._transcript = data.transcript
        self._firstChildLIndex = data.firstChildLIndex
        self._lastChildLIndex = data.lastChildLIndex
        self._start = data.start
        self._end = data.end
        return True

    def getLIndex(self):
        return self._lIndex

    def _getCurrentData(self):
        """Load the L row of the cursor; raises if the row cannot be read."""
        self._gffIndex, self._hIndex, self._parentLIndex, self._start, self._end = self._ncList.getLLineElements(self._lIndex)
        #print "-->", self._lIndex, "-->", self._gffIndex, self._hIndex, self._parentLIndex, self._start, self._end
        if self._end == -1:
            raise Exception("Error")

    def _getParentData(self):
        """Load the L row of the parent."""
        if self._parentLIndex == None:
            self._getCurrentData()
        self._parentGffIndex, self._parentHIndex, greatParentLIndex, self._parentStart, self._parentEnd = self._ncList.getLLineElements(self._parentLIndex)

    def _getTranscript(self):
        if self._gffIndex == None:
            self._getCurrentData()
        self._transcript = self._ncList.getIntervalFromAdress(self._gffIndex)

    def _getSiblingData(self):
        """Compute the first and last L rows of the list holding the cursor."""
        if self._parentHIndex == None:
            self._getParentData()
        if self._parentHIndex == -1:
            # No parent sub-list: the siblings are the whole first list.
            self._firstSiblingLIndex = 0
            self._lastSiblingLIndex = self._ncList.getSizeFirstList() - 1
        else:
            self._firstSiblingLIndex, nbSiblings = self._ncList.getHLineElements(self._parentHIndex)
            self._lastSiblingLIndex = -1 if self._firstSiblingLIndex == -1 else self._firstSiblingLIndex + nbSiblings-1

    def _getChildrenData(self):
        """Compute the first and last L rows of the children (-1 if none)."""
        if self._hIndex == None:
            self._getCurrentData()
        self._firstChildLIndex, nbChildren = self._ncList.getHLineElements(self._hIndex)
        self._lastChildLIndex = -1 if self._firstChildLIndex == -1 else self._firstChildLIndex + nbChildren-1

    def getGffAddress(self):
        if self._gffIndex == None:
            self._getCurrentData()
        return self._gffIndex

    def getStart(self):
        if self._start == None:
            self._getCurrentData()
        return self._start

    def getEnd(self):
        if self._end == None:
            self._getCurrentData()
        return self._end

    def compare(self, cursor):
        """Tell whether both cursors sit on the same L row."""
        return (self._lIndex == cursor._lIndex)

    def getTranscript(self):
        """Return the transcript under the cursor, or None when out."""
        if self.isOut():
            return None
        if self._transcript == None:
            self._getTranscript()
        return self._transcript

    def isFirst(self):
        #print "is last: ", self._lIndex, self._ncList.getSizeFirstList(), self._lastSiblingLIndex
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            return (self._lIndex == 0)
        if self._firstSiblingLIndex == None:
            self._getSiblingData()
        return (self._lIndex == self._firstSiblingLIndex)

    def isLast(self):
        #print "is last: ", self._lIndex, self._ncList.getSizeFirstList(), self._lastSiblingLIndex
        # Under this guard the comparison below is always False.
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            return (self._lIndex == self._ncList.getSizeFirstList() - 1)
        if self._lastSiblingLIndex == None:
            self._getSiblingData()
        return (self._lIndex == self._lastSiblingLIndex)

    def moveUp(self):
        """Move to the parent row."""
        if self._parentLIndex == None:
            self._getCurrentData()
        self._lIndex = self._parentLIndex
        self._updateFromMainListData()
        self._hIndex = self._parentHIndex
        self._gffIndex = self._parentGffIndex
        self._parentLIndex = None
        self._parentHIndex = None
        self._parentGffIndex = None
        self._transcript = None
        self._firstSiblingLIndex = None
        self._lastSiblingLIndex = None
        # NOTE(review): the next two lines assign each field to itself.
        self._firstChildLIndex = self._firstChildLIndex
        self._lastChildLIndex = self._lastChildLIndex
        self._start = self._parentStart
        self._end = self._parentEnd
        self._parentStart = None
        self._parentEnd = None

    def moveRight(self):
        """Move to the next L row; does nothing when the cursor is out."""
        if self.isOut():
            return
        #print "IN1", self
        if self._lIndex < self._ncList.getSizeFirstList() - 1 and self._mainListIndex != None:
            self._mainListIndex += 1
            self._updateFromMainListData()
            #print "IN2", self
        self._lIndex += 1
        # NOTE(review): these resets also discard whatever
        # _updateFromMainListData() loaded just above.
        self._hIndex = None
        self._start = None
        self._end = None
        self._transcript = None
        self._gffIndex = None
        self._firstChildLIndex = None
        self._lastChildLIndex = None
        #print "IN3", self

    def moveNext(self):
        """Climb while the cursor is the last sibling, then step right.

        The cursor goes out (-1) when the climb reaches a top-level row.
        """
        while not self.isOut() and self.isLast():
            if self.isTop():
                self._lIndex = -1
                return
            self.moveUp()
        #print "F1", self
        self.moveRight()
        #print "F2", self

    def moveMiddleSibling(self):
        """Move to the middle row of the current sibling list."""
        # Both divisions use "/": integer division only under Python 2.
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            self._mainListIndex = (self._ncList.getSizeFirstList() - 1) / 2
            self._updateFromMainListData()
        if self._lastSiblingLIndex == None:
            self._getSiblingData()
        self._lIndex = (self._lastSiblingLIndex + self._firstSiblingLIndex) / 2
        self._hIndex = None
        self._start = None
        self._end = None
        self._gffIndex = None
        self._transcript = None
        self._firstChildLIndex = None
        self._lastChildLIndex = None

    def moveSibling(self, lIndex):
        """Move to the given L row, assumed by the caller to be a sibling."""
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            self._mainListIndex = lIndex
            self._updateFromMainListData()
        self._lIndex = lIndex
        self._hIndex = None
        self._start = None
        self._end = None
        self._gffIndex = None
        self._transcript = None
        self._firstChildLIndex = None
        self._lastChildLIndex = None

    def moveLastSibling(self):
        """Move to the last row of the current sibling list."""
        if self._lIndex < self._ncList.getSizeFirstList() - 1:
            self._mainListIndex = self._ncList.getSizeFirstList() - 1
            self._updateFromMainListData()
        if self._lastSiblingLIndex == None:
            self._getSiblingData()
        self._lIndex = self._lastSiblingLIndex
        self._hIndex = None
        self._start = None
        self._end = None
        self._gffIndex = None
        self._transcript = None
        self._firstChildLIndex = None
        self._lastChildLIndex = None

    def moveDown(self):
        """Move to the first child; the current row becomes the parent."""
        if self._firstChildLIndex == None:
            self._getChildrenData()
        self._parentLIndex = self._lIndex
        self._parentHIndex = self._hIndex
        self._parentGffIndex = self._gffIndex
        self._lIndex = self._firstChildLIndex
        self._lastSiblingLIndex = self._lastChildLIndex
        self._hIndex = None
        self._gffIndex = None
        self._transcript = None
        self._firstChildLIndex = None
        self._lastChildLIndex = None
        self._parentStart = self._start
        self._parentEnd = self._end
        self._start = None
        self._end = None

    def isOut(self):
        """Tell whether the cursor left the list (L row -1)."""
        return (self._lIndex == -1)

    def isTop(self):
        """Tell whether the current row has no parent."""
        if self._parentLIndex == None:
            self._getCurrentData()
        return (self._parentLIndex == -1)

    def hasChildren(self):
        if self._hIndex == None:
            self._getCurrentData()
        if self._hIndex == -1:
            return False
        if self._firstChildLIndex == None:
            self._getChildrenData()
        return (self._firstChildLIndex != -1)

    def copy(self, cursor):
        """Take over every field of ``cursor``; ``_mainListData`` is shared, not cloned."""
        self._ncList = cursor._ncList
        self._lIndex = cursor._lIndex
        self._hIndex = cursor._hIndex
        self._gffIndex = cursor._gffIndex
        self._parentLIndex = cursor._parentLIndex
        self._parentHIndex = cursor._parentHIndex
        self._parentGffIndex = cursor._parentGffIndex
        self._transcript = cursor._transcript
        self._firstSiblingLIndex = cursor._firstSiblingLIndex
        self._lastSiblingLIndex = cursor._lastSiblingLIndex
        self._firstChildLIndex = cursor._firstChildLIndex
        self._lastChildLIndex = cursor._lastChildLIndex
        self._mainListData = cursor._mainListData
        self._mainListIndex = cursor._mainListIndex
        self._verbosity = cursor._verbosity
        self._parentStart = cursor._parentStart
        self._parentEnd = cursor._parentEnd
        self._start = cursor._start
        self._end = cursor._end

    def __str__(self):
        return "NC-list: %s, Lindex: %s, Hindex: %s, GFFindex: %s, start: %s, end: %s, parent Lindex: %s, parent Hindex: %s, parent GFFindex: %s, transcript: %s, last sibling: %s" % (self._ncList, self._lIndex, self._hIndex, self._gffIndex, self._start, self._end, self._parentLIndex, self._parentHIndex, self._parentGffIndex, self._transcript, self._lastSiblingLIndex)

# ---- patch metadata of the next file in this changeset (kept verbatim) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/NCListFilePickle.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/ncList/NCListFilePickle.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,123 @@
#!
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

try:
    import cPickle as pickle
except ImportError:
    import pickle
from SMART.Java.Python.structure.Transcript import Transcript


class NCListFilePickle(object):
    """Writes transcripts one after the other, pickled, into a file."""

    def __init__(self, fileName, verbosity = 1):
        """Open ``fileName`` for binary writing (truncating it)."""
        # Set first so __del__ stays safe if open() raises.
        self.handle = None
        self.fileName = fileName
        self.handle = open(fileName, "wb")
        self.verbosity = verbosity

    def __del__(self):
        self.close()

    def addTranscript(self, transcript):
        """Append one transcript, using the highest pickle protocol."""
        pickle.dump(transcript, self.handle, -1)

    def write(self):
        pass

    def close(self):
        """Close the file; calling it again is harmless."""
        handle = getattr(self, "handle", None)
        if handle is not None:
            handle.close()
            self.handle = None


class NCListFileUnpickle(object):
    """Reads back the transcripts written by NCListFilePickle.

    NOTE: unpickling runs code chosen by the file; only open files
    produced by a trusted source.
    """

    def __init__(self, fileName, verbosity = 1):
        """Open ``fileName`` for binary reading."""
        # Set first so __del__ stays safe if open() raises.
        self.handle = None
        self.handle = open(fileName, "rb")
        self.verbosity = verbosity
        self.initAddress = 0
        self.address = self.initAddress
        self.nbTranscripts = None
        self.fileName = fileName
        self.over = False
        self.chromosome = None

    def __del__(self):
        handle = getattr(self, "handle", None)
        if handle is not None:
            handle.close()

    def reset(self):
        """Rewind to the start of the file and forget the initial address."""
        self.handle.seek(0)
        self.initAddress = 0

    def setChromosome(self, chromosome):
        """Stop the reading at the first transcript of another chromosome."""
        self.chromosome = chromosome

    def getNbTranscripts(self):
        """Return the number of transcripts, counting them on the first call."""
        if self.nbTranscripts is not None:
            return self.nbTranscripts
        count = 0
        for transcript in self.getIterator():
            count += 1
        self.nbTranscripts = count
        return self.nbTranscripts

    def gotoAddress(self, address):
        """Seek to the given byte offset."""
        self.handle.seek(address)
        self.address = address

    def getNextTranscript(self):
        """Return the next transcript.

        Returns False, and sets ``over``, at end of file or when the
        transcript belongs to another chromosome than the selected one.
        """
        self.address = self.handle.tell()
        try:
            transcript = pickle.load(self.handle)
            if self.chromosome is not None and transcript.getChromosome() != self.chromosome:
                self.over = True
                return False
            return transcript
        except EOFError:
            self.over = True
            return False

    def getIterator(self):
        """Yield the transcripts, starting from the initial address."""
        self.gotoAddress(self.initAddress)
        while True:
            transcript = self.getNextTranscript()
            if not transcript:
                self.over = True
                return
            yield transcript

    def setInitAddress(self, address):
        """Set the byte offset where getIterator() starts."""
        self.initAddress = address

    def getCurrentTranscriptAddress(self):
        """Return the offset where the last read transcript started."""
        return self.address

    def isOver(self):
        return self.over

# ---- patch metadata of the next file in this changeset (kept verbatim) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/NCListHandler.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/ncList/NCListHandler.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,125 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#

import struct
try:
    import cPickle as pickle
except ImportError:
    import pickle
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCIndex import NCIndex
from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle

# Size in bytes of one native C long; every header cell has this size.
LONG_SIZE = struct.calcsize('l')

# Layout of the per-chromosome header record.
INFO_PER_NCLIST = 5
H_FILE = 0
L_FILE = 1
G_FILE = 2
FIRST_LIST_SIZE = 3
INDEX = 4

# Table identifiers, same values as in NCList.
H = 0
L = 1
T = 2
G = 3

def pack(input):
    """Encode a number as one native C long."""
    # int() packs the same bytes as long() and also exists on Python 3.
    return struct.pack("l", int(input))
def unpack(input):
    """Decode one native C long."""
    return struct.unpack("l", input)[0]


class NCListHandler(object):
    """Reads a merged NC-list file: header, tables and transcripts.

    NOTE: the file is unpickled; only open files from a trusted source.
    """

    def __init__(self, verbosity):
        self._verbosity = verbosity
        self._index = False

    def setFileName(self, fileName):
        """Open the merged file for binary reading."""
        self._fileName = fileName
        self._handle = open(fileName, "rb")

    def _readInfo(self, i, info):
        """Return header cell ``info`` of the ``i``-th chromosome."""
        self._handle.seek(self._headerPos + i * INFO_PER_NCLIST * LONG_SIZE + info * LONG_SIZE)
        return unpack(self._handle.read(LONG_SIZE))

    def loadData(self):
        """Read the header and create one NCList per chromosome.

        Every NCList shares the handle of this object and is given the
        offsets of its own H, L and transcript sections.
        """
        self._chromosomes = pickle.load(self._handle)
        self._nbElements = 0
        self._nbElementsPerChromosome = {}
        self._ncLists = {}
        for chromosome in self._chromosomes:
            self._nbElementsPerChromosome[chromosome] = unpack(self._handle.read(LONG_SIZE))
            self._nbElements += self._nbElementsPerChromosome[chromosome]
        self._headerPos = self._handle.tell()
        for i, chromosome in enumerate(self._chromosomes):
            ncList = NCList(self._verbosity)
            ncList._hHandle = self._handle
            ncList._lHandle = self._handle
            ncList._parser = NCListFileUnpickle(self._fileName)
            ncList.setOffset(H, self._readInfo(i, H_FILE))
            ncList.setOffset(L, self._readInfo(i, L_FILE))
            ncList.setOffset(G, self._readInfo(i, G_FILE))
            ncList._sizeFirstList = self._readInfo(i, FIRST_LIST_SIZE)
            indices = self._readInfo(i, INDEX)
            if indices != -1:
                # -1 means that no index was stored for this chromosome.
                self._handle.seek(indices)
                data = pickle.load(self._handle)
                index = NCIndex(self._verbosity)
                index._indices = data
                ncList._index = index
            self._ncLists[chromosome] = ncList

    def getChromosomes(self):
        return self._chromosomes

    def getNbElements(self):
        return self._nbElements

    def getNbElementsPerChromosome(self):
        return self._nbElementsPerChromosome

    def getNCLists(self):
        return self._ncLists

    def getParser(self, chromosome = None):
        """Return a transcript parser over the merged file.

        Without chromosome, the parser starts at the transcript section of
        the first chromosome.  With one, it starts at the section of that
        chromosome and stops when the chromosome changes.
        """
        parser = NCListFileUnpickle(self._fileName)
        if chromosome is None:
            # The offset has to be read from the file: unpack() takes the
            # raw bytes only, not a handle and a position.
            parser.setInitAddress(self._readInfo(0, G_FILE))
            return parser
        i = self._chromosomes.index(chromosome)
        pos = self._readInfo(i, G_FILE)
        parser.setInitAddress(pos)
        parser.setChromosome(chromosome)
        return parser

# ---- patch metadata of the next file in this changeset (kept verbatim) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/ncList/NCListMerger.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/ncList/NCListMerger.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,126 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# Size in bytes of a native C long; every header entry is one such value.
LONG_SIZE = struct.calcsize('l')

# Each NC-list owns INFO_PER_NCLIST consecutive longs in the header; the
# constants below are the positions of each piece of information.
INFO_PER_NCLIST = 5
H_FILE = 0
L_FILE = 1
G_FILE = 2
FIRST_LIST_SIZE = 3
INDEX = 4


def pack(input):
    """Return *input* encoded as one native long (LONG_SIZE bytes)."""
    # int() promotes to long automatically under Python 2 and also exists
    # under Python 3, unlike long().
    return struct.pack("l", int(input))


def unpack(input):
    """Return the integer stored in the LONG_SIZE bytes *input*."""
    return struct.unpack("l", input)[0]


class NCListMerger(object):
    """Concatenate the files of several NC-lists into one binary file.

    The output starts with the pickled, sorted chromosome names, then one
    long per chromosome (the list's _nbLines), then INFO_PER_NCLIST longs
    per chromosome that are first written as -1 and later patched with the
    address (or value) of each piece of data.
    """

    def __init__(self, verbosity):
        self._verbosity = verbosity
        self._index = False

    def setFileName(self, fileName):
        """Open *fileName* as the (binary) output file."""
        self._handle = open(fileName, "wb")

    def setNCLists(self, ncLists):
        """Register the {chromosome: ncList} mapping to merge."""
        self._ncLists = ncLists
        self._chromosomes = sorted(self._ncLists.keys())

    def addIndex(self, boolean):
        """Choose whether the indices of the lists are stored as well."""
        self._index = boolean

    def merge(self):
        """Write the merged file, then delete the input files."""
        try:
            self._writeHeader()
            self._addNCLists()
        finally:
            # Close the output even when writing fails; the inputs are only
            # removed after a complete, successful merge.
            self._handle.close()
        self._removeInputFiles()

    def _writeHeader(self):
        """Write chromosome names, element counts and a placeholder header."""
        pickle.dump(self._chromosomes, self._handle, -1)
        for chromosome in self._chromosomes:
            self._handle.write(pack(self._ncLists[chromosome]._nbLines))
        self._headerPos = self._handle.tell()
        # Placeholders, patched later by _addInHeader().
        for _ in range(len(self._chromosomes) * INFO_PER_NCLIST):
            self._handle.write(pack(-1))

    def _addInHeader(self, i, info, value = None):
        """Store *value* in entry *info* of the i-th list header.

        When *value* is None, the current write position is stored. The
        write position is restored afterwards.
        """
        currentPos = self._handle.tell()
        if value is None:
            value = currentPos
        self._handle.seek(self._headerPos + (i * INFO_PER_NCLIST + info) * LONG_SIZE)
        self._handle.write(pack(value))
        self._handle.seek(currentPos)

    def _appendInputFile(self, i, info, fileName):
        """Record the current address in the header and append *fileName*."""
        self._addInHeader(i, info)
        # The inputs are binary data: read them as such.
        with open(fileName, "rb") as inputHandle:
            shutil.copyfileobj(inputHandle, self._handle)
        self._inputFileNames.append(fileName)

    def _addNCLists(self):
        """Append all H files, all L files, the optional indices, then all transcript files."""
        self._inputFileNames = []
        for i, chromosome in enumerate(self._chromosomes):
            self._appendInputFile(i, H_FILE, self._ncLists[chromosome]._hFileName)
        for i, chromosome in enumerate(self._chromosomes):
            self._appendInputFile(i, L_FILE, self._ncLists[chromosome]._lFileName)
        for i, chromosome in enumerate(self._chromosomes):
            self._addInHeader(i, FIRST_LIST_SIZE, self._ncLists[chromosome].getSizeFirstList())
        if self._index:
            for i, chromosome in enumerate(self._chromosomes):
                self._addInHeader(i, INDEX)
                pickle.dump(self._ncLists[chromosome].getIndex()._indices, self._handle, -1)
        for i, chromosome in enumerate(self._chromosomes):
            self._appendInputFile(i, G_FILE, self._ncLists[chromosome]._transcriptFileName)

    def _removeInputFiles(self):
        """Delete every input file that was copied into the output."""
        for fileName in self._inputFileNames:
            os.remove(fileName)
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class NCListParser(object):
    """Load NC-lists that were serialised with pickle into one file.

    The file is expected to hold four consecutive pickles: the sorted file
    names, the total number of elements, the number of elements per
    chromosome and the NC-lists themselves.
    """

    def __init__(self, fileName, verbosity = 1):
        self._fileName = fileName
        self._ncLists = {}
        self._sortedFileNames = {}
        self._nbElements = 0
        self._nbElementsPerChromosome = {}
        self._verbosity = verbosity

    def parse(self):
        """Read the four pickles, then let every NC-list reopen its files."""
        # Pickles are binary data: open in "rb" so that binary protocols
        # load correctly on every platform and Python version; the with
        # block also closes the handle when loading fails.
        # NOTE(review): pickle.load must only be used on trusted files.
        with open(self._fileName, "rb") as handle:
            self._sortedFileNames = pickle.load(handle)
            self._nbElements = pickle.load(handle)
            self._nbElementsPerChromosome = pickle.load(handle)
            self._ncLists = pickle.load(handle)
            for ncList in self._ncLists.values():
                ncList._reopenFiles()

    def getSortedFileNames(self):
        return self._sortedFileNames

    def getNbElements(self):
        return self._nbElements

    def getNbElementsPerChromosome(self):
        return self._nbElementsPerChromosome

    def getNCLists(self):
        return self._ncLists
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class Plot(object):
    """Plot tag values of the transcripts of a file through RPlotter.

    Depending on the shape ("barplot", "line", "points" or "heatPoints"),
    the x, y and z values are read from the tags configured with
    setXData(), setYData() and setZData().
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.keep = False

    def keepTmpFiles(self, boolean):
        self.keep = boolean

    def setShape(self, shape):
        self.shape = shape

    def setInputFileName(self, fileName, format):
        self.parser = TranscriptContainer(fileName, format, self.verbosity)

    def setXData(self, tag, default):
        self.x = tag
        self.xDefault = default

    def setYData(self, tag, default):
        self.y = tag
        self.yDefault = default

    def setZData(self, tag, default):
        self.z = tag
        self.zDefault = default

    def setNbBars(self, nbBars):
        self.nbBars = nbBars

    def setOutputFileName(self, fileName):
        self.outputFileName = fileName

    def setRegression(self, regression):
        self.regression = regression

    def setLog(self, log):
        self.log = log

    def createPlotter(self):
        """Create the RPlotter and configure it for the requested shape.

        Raises Exception when the shape is not one of the known names.
        """
        self.plotter = RPlotter(self.outputFileName, self.verbosity, self.keep)
        if self.shape == "barplot":
            self.plotter.setBarplot(True)
        elif self.shape == "line":
            pass
        elif self.shape == "points":
            self.plotter.setPoints(True)
        elif self.shape == "heatPoints":
            self.plotter.setHeatPoints(True)
        else:
            raise Exception("Do not understand shape '%s'\n" % (self.shape))

        self.plotter.setLog(self.log)
        self.plotter.setRegression(self.regression)

    def getValues(self, transcript):
        """Return the (x, y, z) values of *transcript* as floats.

        y and z are None when no tag was configured for them. A missing tag
        value is replaced by the matching default; an Exception is raised
        when the value is missing and no default was given.
        """
        x = transcript.getTagValue(self.x)
        y = None
        z = None
        if self.y is not None:
            y = transcript.getTagValue(self.y)
        if self.z is not None:
            z = transcript.getTagValue(self.z)
        if x is None:
            if self.xDefault is None:
                raise Exception("Error! Transcript %s do not have the x-tag %s\n" % (transcript, self.x))
            x = self.xDefault
        if self.y is not None and y is None:
            if self.yDefault is None:
                raise Exception("Error! Transcript %s do not have the y-tag %s\n" % (transcript, self.y))
            y = self.yDefault
        if self.z is not None and z is None:
            if self.zDefault is None:
                raise Exception("Error! Transcript %s do not have the z-tag %s\n" % (transcript, self.z))
            z = self.zDefault
        x = float(x)
        if self.y is not None:
            y = float(y)
        if self.z is not None:
            z = float(z)
        return (x, y, z)

    def correctPointsToBarplot(self, line):
        """Group the {x: count} points of *line* into self.nbBars bars.

        Bars start at floor(min x) and are (ceil(max x) - floor(min x)) //
        nbBars wide; the returned dict maps the start of each bar to the sum
        of the counts falling into it. Values at the upper bound go to the
        last bar. An empty *line* gives an empty dict.
        """
        if not line:
            return {}
        minValue = int(math.floor(min(line.keys())))
        maxValue = int(math.ceil(max(line.keys())))
        # Integer width, as bar starts are used as dictionary keys.
        step = (maxValue - minValue) // self.nbBars
        values = dict((i * step + minValue, 0) for i in range(self.nbBars))
        lastBar = self.nbBars - 1
        for key, value in line.items():
            # The original divided by (range * nbBars) instead of the bar
            # width, which sent every point to the first bar.
            bar = 0
            if step > 0:
                bar = min(lastBar, int((key - minValue) // step))
            values[bar * step + minValue] += value
        return values

    def parseFile(self):
        """Read every transcript, fill the plotter and draw the plot."""
        line = {}
        heatLine = {}

        cpt = 1
        for transcript in self.parser.getIterator():
            x, y, z = self.getValues(transcript)
            name = transcript.name
            if name == "unnamed transcript":
                name = "transcript %d" % (cpt)
                cpt += 1
            if self.shape in ("points", "heatPoints"):
                line[name] = (x, y)
                if self.shape == "heatPoints":
                    heatLine[name] = z
            if self.shape == "line":
                line[x] = y
            if self.shape == "barplot":
                line[x] = line.get(x, 0) + 1
        if self.shape == "barplot":
            line = self.correctPointsToBarplot(line)
        self.plotter.setXLabel(self.x)
        if self.y is not None:
            self.plotter.setYLabel(self.y)
        else:
            self.plotter.setYLabel("Count")
        self.plotter.addLine(line)
        if self.shape == "heatPoints":
            self.plotter.addHeatLine(heatLine)
        self.plotter.plot()

    def close(self):
        """Print the correlation data and Spearman's rho when relevant."""
        # print(...) with one argument behaves the same under Python 2 and 3.
        if self.regression:
            print(self.plotter.getCorrelationData())
        if self.shape == "points":
            rho = self.plotter.getSpearmanRho()
            if rho is None:
                print("Cannot compute Spearman rho.")
            else:
                print("Spearman rho: %f" % (rho))

    def run(self):
        self.createPlotter()
        self.parseFile()
        self.close()


if __name__ == "__main__":

    # parse command line
    description = "Plot v1.0.2: Plot some information from a list of transcripts. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]")
    parser.add_option("-x", "--x", dest="x", action="store", type="string", help="tag for the x value [format: string]")
    parser.add_option("-y", "--y", dest="y", action="store", type="string", help="tag for the y value [format: string]")
    parser.add_option("-z", "--z", dest="z", action="store", default=None, type="string", help="tag for the z value [format: string]")
    parser.add_option("-X", "--xDefault", dest="xDefault", action="store", default=None, type="float", help="value for x when tag is not present [format: float]")
    parser.add_option("-Y", "--yDefault", dest="yDefault", action="store", default=None, type="float", help="value for y when tag is not present [format: float]")
    parser.add_option("-Z", "--zDefault", dest="zDefault", action="store", default=None, type="float", help="value for z when tag is not present [format: float]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file names [format: output file in PNG format]")
    parser.add_option("-s", "--shape", dest="shape", action="store", default="barplot", type="string", help="shape of the plot [format: choice (barplot, line, points, heatPoints)]")
    parser.add_option("-n", "--nbBars", dest="nbBars", action="store", default=2, type="int", help="number of bars in barplot [format: int]")
    parser.add_option("-k", "--keep", dest="keep", action="store_true", default=False, help="keep temporary files [format: bool]")
    parser.add_option("-r", "--regression", dest="regression", action="store_true", default=False, help="plot regression line (in 'points' format) [format: bool]")
    parser.add_option("-l", "--log", dest="log", action="store", default="y", type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    plot = Plot(options.verbosity)
    plot.setInputFileName(options.inputFileName, options.format)
    plot.setOutputFileName(options.outputFileName)
    plot.setXData(options.x, options.xDefault)
    plot.setYData(options.y, options.yDefault)
    plot.setZData(options.z, options.zDefault)
    plot.setShape(options.shape)
    plot.setNbBars(options.nbBars)
    plot.setRegression(options.regression)
    plot.setLog(options.log)
    plot.keepTmpFiles(options.keep)
    plot.run()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
# Relative strands handled by the coverage computation, and their colors
# (0 is used for the sum of both strands).
strands = [-1, 1]
colors = {-1: "blue", 1: "red", 0: "black"}
colorLine = "black"


def parseTargetField(field):
    """Parse a Target value into (id, start, end, strand).

    The parts may be separated by white space or by '+' characters, and the
    strand is optional (default "+"). Raises Exception when the field has
    neither 3 nor 4 parts, and ValueError when start or end is not an
    integer.
    """
    # White-space separation is tried first, then '+' separation.
    for parts in (field.split(), field.split("+", 4)):
        if len(parts) == 3:
            name, start, end = parts
            return (name, int(start), int(end), "+")
        if len(parts) == 4:
            name, start, end, strand = parts
            return (name, int(start), int(end), strand)
    raise Exception("Cannot parse Target field '%s'." % (field))


class SimpleTranscript(object):
    """Part of transcript1 seen in coordinates relative to transcript2's start.

    Positions are clipped to [0, length of transcript2]; only the exons of
    transcript1 that touch transcript2 are kept.
    """

    def __init__(self, transcript1, transcript2, color = None):
        refStart = transcript2.getStart()
        refLength = transcript2.getEnd() - refStart
        self.start = max(0, transcript1.getStart() - refStart)
        self.end = min(refLength, transcript1.getEnd() - refStart)
        self.strand = transcript1.getDirection() * transcript2.getDirection()
        self.exons = []
        for exon in transcript1.getExons():
            if exon.getEnd() >= refStart and exon.getStart() <= transcript2.getEnd():
                start = max(0, exon.getStart() - refStart)
                end = min(refLength, exon.getEnd() - refStart)
                self.addExon(start, end, self.strand, color)

    def addExon(self, start, end, strand, color):
        self.exons.append(SimpleExon(start, end, strand, color))

    def getRScript(self, yOffset, height):
        """Return the R commands drawing the exons and the lines joining them."""
        pieces = []
        previousEnd = None
        for exon in sorted(self.exons, key=lambda exon: exon.start):
            if previousEnd is not None:
                # Intron: a segment from the previous exon to this one.
                pieces.append("segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine))
            pieces.append(exon.getRScript(yOffset, height))
            previousEnd = exon.end
        return "".join(pieces)


class SimpleExon(object):
    """One exon, drawn as a rectangle."""

    def __init__(self, start, end, strand, color = None):
        self.start = start
        self.end = end
        self.strand = strand
        self.color = color

    def getRScript(self, yOffset, height):
        """Return the R command drawing the exon; the strand color is used when no color was given."""
        color = self.color if self.color is not None else colors[self.strand]
        return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine)


class Plotter(object):
    """Write and run the R scripts plotting coverage and overlaps of one transcript."""

    def __init__(self, seed, index, verbosity):
        self.seed = seed
        self.index = index
        self.verbosity = verbosity
        self.maxCoverage = 0
        self.maxOverlap = 0
        self.log = ""
        self.merge = False
        self.width = 1500
        # Was misspelt "heigth", so getFirstLine() failed with
        # AttributeError unless setPlotSize() had been called.
        self.height = 1000
        self.xLabel = ""
        self.yLabel = ""
        self.title = None
        self.absPath = os.getcwd()
        self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index)
        self.coverageScript = ""
        self.overlapScript = ""
        self.outputFileName = None

    def setOutputFileName(self, fileName):
        self.outputFileName = fileName

    def setTranscript(self, transcript):
        """Set the plotted transcript; its name is appended to the title."""
        self.transcript = transcript
        self.name = transcript.getName()
        self.size = transcript.getEnd() - transcript.getStart() + 1
        if self.title is None:
            self.title = self.name
        else:
            self.title += " " + self.name

    def setTitle(self, title):
        """Set the title to *title* followed by the transcript name (setTranscript() must have been called)."""
        self.title = title + " " + self.name

    def setPlotSize(self, width, height):
        self.width = width
        self.height = height

    def setLabels(self, xLabel, yLabel):
        self.xLabel = xLabel
        self.yLabel = yLabel

    def setMerge(self, merge):
        self.merge = merge

    def setCoverageData(self, coverage):
        """Write the coverage to the data file and prepare the R commands.

        *coverage* maps each strand of `strands` to a {position: depth}
        dict. One line is written per position (position, depth per strand,
        sum). A summary is appended to self.log.
        """
        outputCoveragePerStrand = dict((strand, 0) for strand in strands)
        outputCoverage = 0
        dataFileName = os.path.abspath(self.coverageDataFileName)
        with open(dataFileName, "w") as dataFile:
            for position in range(self.size+1):
                sumValue = 0
                found = False
                dataFile.write("%d\t" % (position))
                for strand in strands:
                    value = coverage[strand].get(position, 0)
                    sumValue += value
                    dataFile.write("%d\t" % (value))
                    if value > 0:
                        found = True
                        outputCoveragePerStrand[strand] += 1
                self.maxCoverage = max(self.maxCoverage, sumValue)
                dataFile.write("%d\n" % (sumValue))
                if found:
                    outputCoverage += 1
        self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100)
        self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (dataFileName)
        self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1])
        self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1])
        self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0])

    def setOverlapData(self, overlap):
        """Prepare the R commands drawing the transcript and the elements of *overlap*.

        The transcript itself is drawn on the first row; the overlapping
        SimpleTranscripts follow, one per row, sorted by start then end.
        """
        height = 1
        self.maxOverlap = (len(overlap) + 1) * height
        thisElement = SimpleTranscript(self.transcript, self.transcript, "black")
        self.overlapScript += thisElement.getRScript(0, height)
        # key= gives the same order as the former cmp= function and also
        # works with Python 3.
        for cpt, transcript in enumerate(sorted(overlap, key=lambda element: (element.start, element.end))):
            self.overlapScript += transcript.getRScript((cpt + 1) * height, height)

    def getFirstLine(self, suffix = None):
        """Return the R command opening the PNG device; *suffix* is ignored in merge mode."""
        return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix is None or self.merge else "_%s" % (suffix), self.width, self.height)

    def getLastLine(self):
        return "dev.off()\n"

    def startR(self, fileName, script):
        """Write *script* to *fileName* and run it with R CMD BATCH.

        Raises Exception when R cannot be started or returns a non-zero status.
        """
        with open(fileName, "w") as scriptFile:
            scriptFile.write(script)
        # An argument list (no shell) keeps file names with spaces or shell
        # metacharacters intact.
        try:
            status = subprocess.call(["R", "CMD", "BATCH", fileName])
        except OSError as error:
            status = error
        if status != 0:
            raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status))

    def _getPlotLine(self, yMax):
        """Return the R command creating the empty plot, with y-axis up to *yMax*."""
        return "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, yMax, self.title)

    def plot(self):
        """Run R: one merged plot, or one overlap plot and one coverage plot."""
        if self.merge:
            fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index)
            script = self.getFirstLine() + self._getPlotLine(max(self.maxCoverage, self.maxOverlap)) + self.overlapScript + self.coverageScript + self.getLastLine()
            self.startR(fileName, script)
        else:
            fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index)
            script = self.getFirstLine("overlap") + self._getPlotLine(self.maxOverlap) + self.overlapScript + self.getLastLine()
            self.startR(fileName, script)
            fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index)
            script = self.getFirstLine("coverage") + self._getPlotLine(self.maxCoverage) + self.coverageScript + self.getLastLine()
            self.startR(fileName, script)
class PlotParser(object):
    """Compute and plot the coverage of reference regions by a set of transcripts.

    Two modes exist: with a sequence (FASTA) file, the regions are the
    sequences and the input elements are placed on them through their
    Target tag (or their mapping); without it, the regions are the
    transcripts of the second input and the first input is compared to them.
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.parsers = [None, None]
        self.sequenceParser = None
        self.seed = random.randint(0, 10000)
        self.title = ""
        self.merge = False
        # Defaults so that plotting and clean-up work even when the matching
        # setters were not called (same values as the Plotter defaults).
        self.outputFileName = None
        self.width = 1500
        self.height = 1000
        self.xLabel = ""
        self.yLabel = ""

    def __del__(self):
        """Remove the temporary data files and the R scripts/outputs of this run."""
        # getattr: __del__ may run on a partially initialised object.
        seed = getattr(self, "seed", None)
        outputFileName = getattr(self, "outputFileName", None)
        patterns = []
        if seed is not None:
            # The underscore after the seed prevents seed 12 from matching
            # the files of seed 123.
            patterns.append("tmpFile_%d_*.dat" % (seed))
        if outputFileName is not None:
            patterns.append("%s*.R" % (os.path.abspath(outputFileName)))
            patterns.append("%s*.Rout" % (os.path.abspath(outputFileName)))
        for pattern in patterns:
            for fileName in glob.glob(pattern):
                os.remove(fileName)

    def addInput(self, inputNb, fileName, fileFormat):
        """Register input *inputNb* (0 or 1); input 0 is also used as input 1 until replaced."""
        if fileName is None:
            return
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(fileFormat)
        self.parsers[inputNb] = chooser.getParser(fileName)
        if inputNb == 0:
            self.parsers[1] = self.parsers[0]

    def addSequence(self, fileName):
        if fileName is None:
            return
        self.sequenceParser = FastaParser(fileName, self.verbosity)

    def setOutput(self, fileName):
        self.outputFileName = fileName

    def setPlotSize(self, width, height):
        self.width = width
        self.height = height

    def setLabels(self, xLabel, yLabel):
        self.xLabel = xLabel
        self.yLabel = yLabel

    def setTitle(self, title):
        self.title = title

    def setMerge(self, merge):
        self.merge = merge

    def initializeDataFromSequences(self):
        """Create empty coverage/overlap data for every region of the sequence file."""
        self.sizes = {}
        self.coverage = {}
        self.overlap = {}
        for region in self.sequenceParser.getRegions():
            self.sizes[region] = self.sequenceParser.getSizeOfRegion(region)
            self.coverage[region] = {}
            self.overlap[region] = []
            for strand in strands:
                self.coverage[region][strand] = {}
                self.coverage[region][strand][1] = 0
                self.coverage[region][strand][self.sizes[region]] = 0

    def initializeDataFromTranscripts(self):
        """Create empty coverage/overlap data for every transcript of the second input."""
        nbTranscripts = self.parsers[1].getNbTranscripts()
        self.coverage = dict((i, None) for i in range(nbTranscripts))
        self.overlap = dict((i, None) for i in range(nbTranscripts))
        self.sizes = dict((i, 0) for i in range(nbTranscripts))
        progress = Progress(nbTranscripts, "Reading regions", self.verbosity)
        for cpt, transcript in enumerate(self.parsers[1].getIterator()):
            self.coverage[cpt] = {}
            self.overlap[cpt] = []
            for strand in strands:
                self.coverage[cpt][strand] = {}
                self.coverage[cpt][strand][0] = 0
                self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0
            for exon in transcript.getExons():
                self.sizes[cpt] += exon.getSize()
            progress.inc()
        progress.done()

    def initialize(self):
        if self.sequenceParser is None:
            self.initializeDataFromTranscripts()
        else:
            self.initializeDataFromSequences()

    def computeCoverage(self, transcript1, transcript2, id):
        """Add 1 to the coverage of region *id* on every position shared by exons of both transcripts.

        Positions are relative to the start of transcript2 (its start being
        position 1); the strand is the product of both directions.
        """
        strand = transcript1.getDirection() * transcript2.getDirection()
        coverage = self.coverage[id][strand]
        for exon1 in transcript1.getExons():
            for exon2 in transcript2.getExons():
                if exon1.overlapWith(exon2):
                    for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1):
                        relativePosition = position - transcript2.getStart() + 1
                        coverage[relativePosition] = coverage.get(relativePosition, 0) + 1

    def computeOverlap(self, transcript1, transcript2, id):
        simpleTranscript = SimpleTranscript(transcript1, transcript2)
        self.overlap[id].append(simpleTranscript)

    def compute2TranscriptFiles(self):
        """Compare every transcript of input 0 with every transcript of input 1."""
        progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity)
        for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
            for transcript1 in self.parsers[0].getIterator():
                if transcript1.overlapWithExon(transcript2):
                    self.computeCoverage(transcript1, transcript2, cpt2)
                    self.computeOverlap(transcript1, transcript2, cpt2)
            progress.inc()
        progress.done()

    def _createReference(self, name):
        """Return a "+" transcript spanning the whole region *name* (1 to its size)."""
        referenceTranscript = Transcript()
        referenceTranscript.setChromosome(name)
        referenceTranscript.setName(name)
        referenceTranscript.setDirection("+")
        referenceTranscript.setEnd(self.sizes[name])
        referenceTranscript.setStart(1)
        return referenceTranscript

    def extractReferenceQueryMapping(self, mapping):
        """Return (reference, query) for a mapping: the query is the mapping's transcript."""
        queryTranscript = mapping.getTranscript()
        referenceTranscript = self._createReference(queryTranscript.getChromosome())
        return (referenceTranscript, queryTranscript)

    def extractReferenceQuery(self, inputTranscript):
        """Return (reference, query) for a transcript carrying a Target tag.

        The query is the transcript expressed in the coordinates given by
        its Target tag. For multi-exon transcripts, an exon with its own
        Target tag uses it; the other exons are placed proportionally.
        Raises Exception when the Target tag is missing or names an unknown
        region.
        """
        if "Target" not in inputTranscript.getTagNames():
            raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript))
        id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target"))
        if id not in self.sizes:
            raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript))
        referenceTranscript = self._createReference(id)
        queryTranscript = Transcript()
        queryTranscript.setChromosome(id)
        queryTranscript.setName(id)
        queryTranscript.setStart(start)
        queryTranscript.setEnd(end)
        queryTranscript.setDirection(strand)
        if inputTranscript.getNbExons() > 1:
            factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart())
            for exon in inputTranscript.getExons():
                newExon = Interval()
                newExon.setChromosome(id)
                newExon.setDirection(strand)
                # The original tested the transcript's tags here (always
                # true at this point), so exons without a Target crashed,
                # and it overwrote id/start/end/strand for the next exons.
                exonTarget = exon.getTagValue("Target")
                if exonTarget is not None:
                    exonStart, exonEnd = parseTargetField(exonTarget)[1:3]
                    newExon.setStart(exonStart)
                    newExon.setEnd(exonEnd)
                else:
                    newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start)
                    newExon.setEnd(int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start)
                queryTranscript.addExon(newExon)
        return (referenceTranscript, queryTranscript)

    def compute1TranscriptFiles(self):
        """Place every element of the input on its reference sequence."""
        progress = Progress(self.parsers[1].getNbItems(), "Comparing regions", self.verbosity)
        for transcript in self.parsers[1].getIterator():
            if transcript.__class__.__name__ == "Mapping":
                referenceTranscript, queryTranscript = self.extractReferenceQueryMapping(transcript)
            else:
                referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript)
            self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName())
            self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName())
            progress.inc()
        progress.done()

    def compute(self):
        if self.sequenceParser is None:
            self.compute2TranscriptFiles()
        else:
            self.compute1TranscriptFiles()

    def plotTranscript(self, index, transcript):
        """Plot region *index* and return the textual summary of its coverage."""
        plotter = Plotter(self.seed, index, self.verbosity)
        plotter.setOutputFileName(self.outputFileName)
        plotter.setTranscript(transcript)
        plotter.setTitle(self.title)
        plotter.setLabels(self.xLabel, self.yLabel)
        plotter.setPlotSize(self.width, self.height)
        plotter.setCoverageData(self.coverage[index])
        plotter.setOverlapData(self.overlap[index])
        plotter.setMerge(self.merge)
        plotter.plot()
        return plotter.log

    def plot1TranscriptFile(self):
        """Plot every region of the sequence file."""
        self.outputCoverage = {}
        self.outputCoveragePerStrand = {}
        output = ""
        regions = self.sequenceParser.getRegions()
        progress = Progress(len(regions), "Plotting regions", self.verbosity)
        for region in regions:
            transcript = Transcript()
            transcript.setName(region)
            transcript.setDirection("+")
            transcript.setEnd(self.sizes[region])
            transcript.setStart(1)
            output += self.plotTranscript(region, transcript)
            progress.inc()
        progress.done()
        if self.verbosity > 0:
            # print(...) with one argument behaves the same under Python 2 and 3.
            print(output)

    def plot2TranscriptFiles(self):
        """Plot every transcript of the second input."""
        nbTranscripts = self.parsers[1].getNbTranscripts()
        self.outputCoverage = [0] * nbTranscripts
        self.outputCoveragePerStrand = [None] * nbTranscripts
        for cpt in range(nbTranscripts):
            self.outputCoveragePerStrand[cpt] = dict((strand, 0) for strand in strands)
        progress = Progress(nbTranscripts, "Plotting regions", self.verbosity)
        output = ""
        for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()):
            output += self.plotTranscript(cpt2, transcript2)
            progress.inc()
        progress.done()
        if self.verbosity > 0:
            print(output)

    def plot(self):
        if self.sequenceParser is None:
            self.plot2TranscriptFiles()
        else:
            self.plot1TranscriptFile()

    def start(self):
        self.initialize()
        self.compute()
        self.plot()
[Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript or mapping format given by -f]") + parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript or mapping file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]") + parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") + parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]") + parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]") + parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]") + parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]") + parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]") + parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the 
elements on the plus strand [format: string] [default: red]") + parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]") + parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]") + parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]") + parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]") + parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + colors[1] = options.plusColor + colors[-1] = options.minusColor + colors[0] = options.sumColor + colorLine = options.lineColor + + pp = PlotParser(options.verbosity) + pp.addInput(0, options.inputFileName1, options.inputFormat1) + pp.addInput(1, options.inputFileName2, options.inputFormat2) + pp.addSequence(options.inputSequence) + pp.setOutput(options.outputFileName if os.path.isabs(options.outputFileName) else os.path.join(options.working_Dir, options.outputFileName)) + pp.setPlotSize(options.width, options.height) + pp.setLabels(options.xLabel, options.yLabel) + pp.setTitle(options.title) + pp.setMerge(options.merge) + pp.start() + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/plotGenomeCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/plotGenomeCoverage.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,132 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class GetGenomeCoverage(object):
    """Compute and plot the distribution of the read depth over a genome.

    The reads come from a TranscriptContainer, the chromosome sizes from a
    FastaParser, and the depth distribution is drawn with RPlotter.
    """

    def __init__(self, verbosity = 1):
        self.verbosity       = verbosity
        self.inputContainer  = None
        self.referenceParser = None
        self.outputFileName  = None
        self.genomeSize      = None
        # chromosome -> {position -> number of exons covering the position}
        self.coverage        = {}
        # depth -> number of positions having this depth
        self.distribution    = {}


    def setInputFile(self, fileName, format):
        """Open the reads file *fileName*, written in the transcript format *format*."""
        self.inputContainer = TranscriptContainer(fileName, format, self.verbosity)


    def setOutputFile(self, fileName):
        """Set the name of the file handed to the plotter."""
        self.outputFileName = fileName


    def setReference(self, fileName):
        """Open the FASTA file *fileName* holding the reference sequences."""
        self.referenceParser = FastaParser(fileName, self.verbosity)


    def getReferenceSizes(self):
        """Store in self.genomeSize the sum of the sizes of all the reference regions."""
        self.genomeSize = sum(self.referenceParser.getSizeOfRegion(chromosome) for chromosome in self.referenceParser.getRegions())


    def getCoverage(self):
        """Fill self.coverage with the number of exons covering each position."""
        progress = Progress(self.inputContainer.getNbTranscripts(), "Reading reads", self.verbosity)
        for transcript in self.inputContainer.getIterator():
            depths = self.coverage.setdefault(transcript.getChromosome(), {})
            for exon in transcript.getExons():
                # exon bounds are inclusive
                for pos in range(exon.getStart(), exon.getEnd() + 1):
                    depths[pos] = depths.get(pos, 0) + 1
            progress.inc()
        progress.done()


    def getDistribution(self):
        """Fill self.distribution (depth -> number of positions) from self.coverage.

        getReferenceSizes() and getCoverage() must have been called before.
        """
        nbNucleotides = sum(len(depths) for depths in self.coverage.values())
        progress = Progress(nbNucleotides, "Building distribution", self.verbosity)
        for depths in self.coverage.values():
            for num in depths.values():
                self.distribution[num] = self.distribution.get(num, 0) + 1
                progress.inc()
        progress.done()
        # every reference position absent from self.coverage has a depth of 0
        self.distribution[0] = self.genomeSize - nbNucleotides


    def plotDistribution(self):
        """Plot self.distribution and print its min/avg/med/max summary."""
        plotter = RPlotter(self.outputFileName, self.verbosity)
        plotter.setFill(0)
        plotter.addLine(self.distribution)
        plotter.plot()
        # single-argument print(): same output with Python 2 and Python 3
        print("min/avg/med/max reads per base: %d/%.2f/%.1f/%d" % getMinAvgMedMax(self.distribution))


    def run(self):
        """Run the whole analysis: sizes, coverage, distribution, plot."""
        self.getReferenceSizes()
        self.getCoverage()
        self.getDistribution()
        self.plotDistribution()


if __name__ == "__main__":

    # parse command line
    description = "Plot Genome Coverage v1.0.1: Get the coverage of a genome. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",            type="string", help="reads file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format",    dest="format",         action="store",            type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-r", "--reference", dest="reference",      action="store",            type="string", help="sequences file [compulsory] [format: file in FASTA format]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",            type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1, type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # these options are flagged [compulsory]: stop with a usage message instead of
    # failing later inside a parser
    for optionName in ("inputFileName", "format", "reference", "outputFileName"):
        if getattr(options, optionName) is None:
            parser.error("compulsory option '%s' is missing" % (optionName))

    getGenomeCoverage = GetGenomeCoverage(options.verbosity)
    getGenomeCoverage.setInputFile(options.inputFileName, options.format)
    getGenomeCoverage.setOutputFile(options.outputFileName)
    getGenomeCoverage.setReference(options.reference)
    getGenomeCoverage.run()
# Plot the data from the data files

def getScale(maxPosition):
    """Return the (unit label, divisor) used to display positions up to *maxPosition*."""
    if maxPosition <= 1000:
        return ("nt.", 1.0)
    if maxPosition <= 1000000:
        return ("kb", 1000.0)
    return ("Mb", 1000000.0)


def getLegend(name, strandString):
    """Return the legend of a line ("tag name (strand)"), or None when the tag has no name."""
    if name is None:
        return None
    return "%s (%s)" % (name.replace("_", " "), strandString)


def normalizeLines(lines, nbElements):
    """Rescale in place all the values so that each condition weighs as much as the largest one.

    lines:      one item per condition, each {chromosome: {strand: {position: value}}}
    nbElements: total number of elements counted for each condition
    """
    maxNbElements = max(nbElements)
    for i, linesPerCondition in enumerate(lines):
        if nbElements[i] == 0:
            # nothing to rescale, and dividing would raise ZeroDivisionError
            continue
        for linesPerChromosome in linesPerCondition.values():
            for line in linesPerChromosome.values():
                for key in line:
                    line[key] = line[key] / float(nbElements[i]) * maxNbElements


if __name__ == "__main__":

    # parse command line
    description = "Plot Repartition v1.0.1: Plot the repartition of different data on a whole genome. (This tool uses 1 input file only, the different values being stored in the tags. See documentation to know more about it.) [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                       type="string", help="input file name [compulsory] [format: file in GFF3 format]")
    parser.add_option("-n", "--names",     dest="names",          action="store",      default=None,    type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",                       type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-c", "--color",     dest="colors",         action="store",      default=None,    type="string", help="color of the lines (separated by commas and no space) [format: string]")
    parser.add_option("-f", "--format",    dest="format",         action="store",      default="png",   type="string", help="format of the output file [format: string] [default: png]")
    parser.add_option("-r", "--normalize", dest="normalize",      action="store_true", default=False,                  help="normalize data (when panels are different) [format: bool] [default: false]")
    parser.add_option("-l", "--log",       dest="log",            action="store",      default="",      type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,       type="int",    help="trace level [format: int]")
    parser.add_option("-D", "--directory", dest="working_Dir",    action="store",      default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
    (options, args) = parser.parse_args()

    # without these checks a missing output name silently creates "None<chromosome>.png"
    if options.inputFileName is None or options.outputFileName is None:
        parser.error("options -i and -o are compulsory")

    strands        = [1, -1]
    strandToString = {1: "+", -1: "-"}
    names          = [None] if options.names is None else options.names.split(",")
    maxs           = {}
    nbElements     = [0 for name in names]
    lines          = [{} for name in names]
    if options.colors is None:
        colors = [None for name in names]
    else:
        colors = options.colors.split(",")
        if len(colors) < len(names):
            parser.error("%d colors given for %d names" % (len(colors), len(names)))

    # lines[i][chromosome][strand][start] is the (signed) number of elements of
    # condition #i starting at this position; maxs[chromosome] is the largest start seen
    gffParser = GffParser(options.inputFileName, options.verbosity)
    progress  = Progress(gffParser.getNbTranscripts(), "Reading %s" % (options.inputFileName), options.verbosity)
    for transcript in gffParser.getIterator():
        chromosome = transcript.getChromosome()
        direction  = transcript.getDirection()
        start      = transcript.getStart()
        maxs[chromosome] = max(maxs.get(chromosome, start), start)
        for i, name in enumerate(names):
            if chromosome not in lines[i]:
                lines[i][chromosome] = dict([(strand, {}) for strand in strands])
            # the tag value is the weight of the transcript; 1 when no tag is used
            if name is not None and name in transcript.getTagNames():
                thisNbElements = float(transcript.getTagValue(name))
            else:
                thisNbElements = 1
            line = lines[i][chromosome][direction]
            # minus strand values are negative, so that they are drawn below the axis
            line[start] = line.get(start, 0) + thisNbElements * direction
            nbElements[i] += thisNbElements
        progress.inc()
    progress.done()

    if options.normalize:
        if options.verbosity >= 10:
            print("Normalizing...")
        normalizeLines(lines, nbElements)
        if options.verbosity >= 10:
            print("... done.")

    progress = Progress(len(maxs), "Plotting", options.verbosity)
    for chromosome in maxs:
        plot = RPlotter("%s%s.%s" % (options.outputFileName, chromosome.capitalize(), options.format), options.verbosity)
        plot.setLog(options.log)
        plot.setImageSize(2000, 500)
        plot.setFormat(options.format)
        unit, ratio = getScale(maxs[chromosome])
        plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
        plot.setYLabel("# reads")
        plot.setLegend(True)
        for i, name in enumerate(names):
            for strand in strands:
                correctedLine = dict([(key / ratio, value) for key, value in lines[i][chromosome][strand].items()])
                # The legend used to be built into 'name' itself (so the second strand
                # read "x (+) (-)") and was never given to the plotter.
                # NOTE(review): assumes the 2nd parameter of RPlotter.addLine is the
                # line name -- confirm against RPlotter.
                plot.addLine(correctedLine, getLegend(name, strandToString[strand]), colors[i])
        plot.plot()
        progress.inc()
    progress.done()
class PlotTranscriptList(object):
    """Plot values read in the tags of a list of transcripts.

    The shape of the plot is one of "line", "barplot", "points" or
    "heatPoints". The tag names (x, y, z), their default values, the shape,
    the bucket and the log axes are plain attributes set by the caller.
    """

    def __init__(self, verbosity = 0):
        self.inputFileName = None
        self.format        = None
        self.x             = None
        self.y             = None
        self.z             = None
        self.xDefault      = None
        self.yDefault      = None
        self.zDefault      = None
        self.xLabel        = None
        self.yLabel        = None
        self.shape         = None
        self.bucket        = None
        self.keep          = None
        self.log           = None
        # the constructor argument used to be discarded (attribute forced to None)
        self.verbosity     = verbosity


    def setPlotter(self, outputFileName, keep, log, xLabel, yLabel):
        """Create the plotter; self.shape must be set before, since barplots never use log axes."""
        self.plotter = RPlotter(outputFileName, self.verbosity, keep)
        if self.shape != "barplot":
            self.plotter.setLog(log)
        self.plotter.setXLabel(xLabel)
        self.plotter.setYLabel(yLabel)


    def setShape(self, shape):
        """Record *shape* and configure the plotter for it; exit on an unknown shape.

        setPlotter() must have been called before.
        """
        self.shape = shape
        if shape == "line":
            # default drawing mode of the plotter: nothing to configure
            pass
        elif shape == "barplot":
            self.plotter.setBarplot(True)
        elif shape == "points":
            self.plotter.setPoints(True)
        elif shape == "heatPoints":
            self.plotter.setHeatPoints(True)
        else:
            sys.exit("Do not understand shape '%s'" % (shape))


    def setInput(self, inputFileName, format):
        """Open the transcript file *inputFileName*, written in format *format*."""
        self.parser = TranscriptContainer(inputFileName, format, self.verbosity)


    def getValues(self, transcript):
        """Return the (x, y, z) values of *transcript* as floats.

        Missing tags are replaced by the corresponding default value; the
        program exits when a required value has neither tag nor default.
        y is not required for the "line" and "barplot" shapes, z is only read
        when a z tag is set; unused values are returned as None.
        """
        y, z = None, None
        x = transcript.getTagValue(self.x)
        if self.y is not None:
            y = transcript.getTagValue(self.y)
        if self.z is not None:
            z = transcript.getTagValue(self.z)
        if x is None:
            if self.xDefault is None:
                sys.exit("Error! Transcript %s do not have the x-tag %s" % (transcript, self.x))
            x = self.xDefault
        if y is None and self.shape != "line" and self.shape != "barplot":
            if self.yDefault is None:
                sys.exit("Error! Transcript %s do not have the y-tag %s" % (transcript, self.y))
            y = self.yDefault
        if self.z is not None and z is None:
            if self.zDefault is None:
                sys.exit("Error! Transcript %s do not have the z-tag %s" % (transcript, self.z))
            z = self.zDefault
        x = float(x)
        # convert only what exists: y may legitimately be absent for line/barplot
        # shapes, where float(None) used to raise a TypeError
        if y is not None:
            y = float(y)
        if z is not None:
            z = float(z)
        return (x, y, z)


    def readFile(self):
        """Read all the transcripts and return the tuple (line, heatLine).

        For "points" and "heatPoints", line maps each transcript name to
        (x, y) and heatLine maps it to z (heatPoints only). For "line" and
        "barplot", line maps each x value to its number of occurrences.
        """
        cpt      = 1
        line     = {}
        heatLine = {}
        for transcript in self.parser.getIterator():
            x, y, z = self.getValues(transcript)

            name = transcript.name
            if name == "unnamed transcript":
                # give a distinct name to each anonymous transcript
                name = "transcript %d" % (cpt)
                cpt += 1
            if self.shape == "points":
                line[name] = (x, y)
            elif self.shape == "heatPoints":
                line[name] = (x, y)
                heatLine[name] = z
            elif self.shape == "line" or self.shape == "barplot":
                line[x] = line.get(x, 0) + 1
            else:
                sys.exit("Do not understand shape '%s'" % (self.shape))
        return line, heatLine


    def putLineInBuckets(self, line):
        """Return a copy of *line* whose keys are rounded down to multiples of self.bucket.

        The values falling in the same bucket are added (the last one used to
        overwrite the others).
        """
        bucketedLine = {}
        for key, value in line.items():
            bucketKey = int(key / float(self.bucket)) * self.bucket
            bucketedLine[bucketKey] = bucketedLine.get(bucketKey, 0) + value
        return bucketedLine


    def clusterInBarplot(self, line):
        """Gather the values of *line* into self.bucket equal-width buckets.

        When self.log is set, the keys are replaced by their logarithm (the
        returned keys stay in log space) and the key 0, which has no
        logarithm, is left out. Returns {lower bound of bucket: sum of values}.
        """
        useLog   = self.log != ""
        nbZeros  = 0
        minValue = min(line.keys())
        maxValue = max(line.keys())
        if useLog:
            if minValue == 0:
                # number of observations left out, not number of distinct keys
                nbZeros  = line[0]
                # smallest non-zero key; the cap is the historical starting value
                minValue = min([1000000000] + [key for key in line if key != 0])
            minValue = math.log(minValue)
            maxValue = math.log(maxValue)
        bucketSize = (maxValue - minValue) / float(self.bucket)
        clusters   = {}
        for i in range(int(self.bucket) + 1):
            clusters[i * bucketSize + minValue] = 0
        for key, value in line.items():
            if useLog:
                # test the key before taking its logarithm: log(1) == 0 used to be
                # mistaken for a zero key and silently dropped
                if key == 0:
                    continue
                key = math.log(key)
            # a null bucket size means that all the keys are equal: single bucket
            index = int((key - minValue) / bucketSize) if bucketSize else 0
            clusters[index * bucketSize + minValue] += value
        print("%d zeros have been removed" % (nbZeros))
        return clusters


    def getSpearmanRho(self):
        """Print the Spearman rho computed by the plotter, if any."""
        rho = self.plotter.getSpearmanRho()
        if rho is None:
            print("Cannot compute Spearman rho.")
        else:
            print("Spearman rho: %f" % (rho))


    def run(self):
        """Read the input, prepare the data for the chosen shape, and plot."""
        line, heatLine = self.readFile()

        if self.shape == "line" and self.bucket is not None:
            line = self.putLineInBuckets(line)
        if self.shape == "barplot":
            line = self.clusterInBarplot(line)

        if self.shape == "points" or self.shape == "barplot" or self.shape == "line":
            self.plotter.addLine(line)
        elif self.shape == "heatPoints":
            self.plotter.addLine(line)
            self.plotter.addHeatLine(heatLine)
        else:
            sys.exit("Do not understand shape '%s'" % (self.shape))

        self.plotter.plot()

        if self.shape == "points" or self.shape == "heatPoints":
            self.getSpearmanRho()



if __name__ == "__main__":

    # parse command line
    description = "Plot v1.0.2: Plot some information from a list of transcripts. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format",dest="format", action="store",type="string", help="format of the input [compulsory] [format: transcript file format]")
    parser.add_option("-x", "--x",dest="x",action="store", type="string", help="tag for the x value [format: string]")
    parser.add_option("-y", "--y",dest="y",action="store", type="string", help="tag for the y value [format: string]")
    parser.add_option("-z", "--z",dest="z", action="store", default=None,type="string", help="tag for the z value [format: string]")
    parser.add_option("-X", "--xDefault",dest="xDefault",action="store", default=None,type="float",help="value for x when tag is not present [format: float]")
    parser.add_option("-Y", "--yDefault",dest="yDefault",action="store",default=None,type="float",help="value for y when tag is not present [format: float]")
    parser.add_option("-Z", "--zDefault",dest="zDefault", action="store",default=None,type="float",help="value for z when tag is not present [format: float]")
    parser.add_option("-n", "--xLabel",dest="xLabel",action="store",default="",type="string", help="label on the x-axis [format: string] [default: ]")
    parser.add_option("-m", "--yLabel",dest="yLabel",action="store",default="", type="string", help="label on the y-axis [format: string] [default: ]")
    parser.add_option("-o", "--output",dest="outputFileName",action="store",type="string", help="output file names [format: output file in PNG format]")
    parser.add_option("-s", "--shape",dest="shape",action="store", type="string", help="shape of the plot [format: choice (barplot, line, points, heatPoints)]")
    parser.add_option("-b", "--bucket",dest="bucket",action="store",default=None,type="float",help="bucket size (for the line plot) [format: int] [default: 1]")
    parser.add_option("-k", "--keep",dest="keep",action="store_true", default=False, help="keep temporary files [format: bool]")
    parser.add_option("-l", "--log",dest="log",action="store",default="",type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string] [default: ]")
    parser.add_option("-v", "--verbosity",dest="verbosity",action="store",default=1, type="int",help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # the barplot divides the x range by the bucket value: without -b it used to
    # stop on a TypeError in the middle of the run
    if options.shape == "barplot" and options.bucket is None:
        parser.error("option -b is required with the barplot shape")

    plotTranscriptList = PlotTranscriptList(options.verbosity)
    plotTranscriptList.x        = options.x
    plotTranscriptList.y        = options.y
    plotTranscriptList.z        = options.z
    plotTranscriptList.xDefault = options.xDefault
    plotTranscriptList.yDefault = options.yDefault
    plotTranscriptList.zDefault = options.zDefault
    plotTranscriptList.shape    = options.shape
    plotTranscriptList.bucket   = options.bucket
    plotTranscriptList.log      = options.log
    plotTranscriptList.setPlotter(options.outputFileName, options.keep, options.log, options.xLabel, options.yLabel)
    plotTranscriptList.setShape(options.shape)
    plotTranscriptList.setInput(options.inputFileName, options.format)
    plotTranscriptList.run()
# Transform qual and fasta files to a single fastq file

def readRecordLine(handle):
    """Return the next non-blank line of *handle*, stripped, or "" at the end of the file."""
    for rawLine in iter(handle.readline, ""):
        line = rawLine.strip()
        if line:
            return line
    return ""


def convertQualToFastq(fastaFile, qualFile, fastqFile, verbosity):
    """Merge a FASTA file and its quality file into a FASTQ file.

    The two inputs are read in lockstep: header lines (">name") must face
    header lines, and each sequence line must face one line holding as many
    space-separated integer qualities as the sequence has letters. Qualities
    are written with an offset of 33 and kept within 0..93. Blank lines are
    skipped (they used to end the conversion silently).

    fastaFile, qualFile: readable text handles
    fastqFile:           writable text handle
    verbosity:           > 0 prints the final count, > 1 also prints progress

    Returns the number of lines read in the FASTA file. Raises Exception when
    the two files do not match.
    """
    fastaLine = readRecordLine(fastaFile)
    qualLine  = readRecordLine(qualFile)
    header    = None
    cpt       = 0
    while fastaLine:
        if not qualLine:
            raise Exception("Qual file is shorter!")
        if fastaLine[0] == ">":
            header = fastaLine[1:]
            if qualLine[0] != ">":
                raise Exception("Discrepencies around %s!" % (header))
            fastqFile.write("@%s\n" % (header))
        else:
            if qualLine[0] == ">":
                raise Exception("Discrepencies around %s!" % (qualLine[1:]))
            intQualities = qualLine.split()
            if len(intQualities) != len(fastaLine):
                raise Exception("Sizes of read and quality diverge in %s!" % (header))
            # '!' (33) to '~' (126) are the only characters allowed in a quality line
            chrQualities = [chr(max(0, min(int(quality), 93)) + 33) for quality in intQualities]
            fastqFile.write("%s\n+\n%s\n" % (fastaLine, "".join(chrQualities)))
        fastaLine = readRecordLine(fastaFile)
        qualLine  = readRecordLine(qualFile)
        if cpt % 1000 == 0 and verbosity > 1:
            sys.stdout.write("%d lines read\r" % (cpt))
            sys.stdout.flush()
        cpt += 1
    if verbosity > 0:
        print("%d lines read" % (cpt))

    if qualLine:
        raise Exception("Qual file is longer!")
    return cpt


if __name__ == "__main__":

    # parse command line
    description = "Qual To FastQ v1.0.2: Convert a file in FASTA/Qual format to FastQ format. [Category: Conversion]"

    parser = OptionParser(description = description)
    parser.add_option("-f", "--fasta",     dest="fastaFileName",  action="store",               type="string", help="input fasta file [compulsory] [format: file in FASTA format]")
    parser.add_option("-q", "--qual",      dest="qualFileName",   action="store",               type="string", help="input qual file [compulsory] [format: file in TXT format]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in FASTQ format]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # open(None) would otherwise stop the script with an obscure TypeError
    if options.fastaFileName is None or options.qualFileName is None or options.outputFileName is None:
        parser.error("options -f, -q and -o are compulsory")

    # the handles are closed (and the output flushed) even when the conversion fails
    with open(options.fastaFileName) as fastaFile:
        with open(options.qualFileName) as qualFile:
            with open(options.outputFileName, "w") as fastqFile:
                convertQualToFastq(fastaFile, qualFile, fastqFile, options.verbosity)
# Remove all tmp tables in the MySQL database

def removeMatchingFiles(patterns):
    """Delete the files of the current directory matching one of the glob *patterns*.

    Each deleted file is reported on the standard output. Returns the list of
    the deleted file names, in deletion order.
    """
    removedFiles = []
    for pattern in patterns:
        for tmpFile in glob.glob(pattern):
            print("  removing %s" % (tmpFile))
            os.unlink(tmpFile)
            removedFiles.append(tmpFile)
    return removedFiles


if __name__ == "__main__":

    description = "Remove Tables v1.0.2: Remove tables in the local MySQL database. [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-t", "--tmp",   dest="tmp",   action="store_true",  default=False, help="Remove temporary tables only [format: bool] [default: false]")
    parser.add_option("-f", "--files", dest="files", action="store_false", default=True,  help="Do not remove temporary files [format: bool] [default: true]")
    (options, args) = parser.parse_args()

    # NOTE(review): option -t is parsed but never read, and the database files are
    # only removed when the temporary files are removed too (both steps depend on
    # -f). Behaviour kept as is -- confirm the intent.
    print("Removing temporary databases:")
    if options.files:
        removeMatchingFiles(["smartdb*"])
    print("Removing temporary files:")
    if options.files:
        removeMatchingFiles(["tmp*.dat", "tmp*.R", "tmp*.Rout"])
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+#
+"""
+Remove empty sequences from a FASTA or FASTQ file
+"""
+
+import os, random
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import *
+from commons.core.parsing.FastqParser import *
+from commons.core.writer.FastaWriter import *
+from commons.core.writer.FastqWriter import *
+from SMART.Java.Python.misc.Progress import *
+
+
+class EmptySequenceRemover(object):
+    """Copy a FASTA/FASTQ file to a writer, leaving out the empty and the forbidden sequences."""
+    def __init__(self, verbosity = 1):
+        self.verbosity = verbosity
+        self.inputFileName = None
+        self.parser = None
+        self.format = None
+        self.writer = None
+        self.forbiddenNames = {}  # names that parse() skips without writing them
+        self.removedNames = {}  # names of the empty sequences met by parse()
+
+    # Remember the input file and its format, and open the matching parser (exits on an unknown format).
+    def setInputFileName(self, fileName, format):
+        self.inputFileName = fileName
+        self.format = format
+        if self.format == "fasta":
+            self.parser = FastaParser(self.inputFileName, self.verbosity)
+        elif self.format == "fastq":
+            self.parser = FastqParser(self.inputFileName, self.verbosity)
+        else:
+            sys.exit("Do not understand '%s' file format." % (self.format))
+
+    # Open the writer for the format stored by setInputFileName(), which must therefore be called first.
+    def setOutputFileName(self, fileName):
+        if self.format == "fasta":
+            self.writer = FastaWriter(fileName, self.verbosity)
+        elif self.format == "fastq":
+            self.writer = FastqWriter(fileName, self.verbosity)
+
+    # Write every sequence that is neither empty nor forbidden; record the names of the empty ones.
+    def parse(self):
+        progress = Progress(self.parser.getNbSequences(), "Reading sequences in %s" % (self.inputFileName), self.verbosity)
+        for sequence in self.parser.getIterator():
+            name = sequence.name.split("/")[0]  # keep what precedes the first "/"; NOTE(review): presumably drops a mate suffix such as "/1" - confirm
+            if name not in self.forbiddenNames:
+                if sequence.sequence == "":
+                    self.removedNames[name] = 1
+                else:
+                    self.writer.addSequence(sequence)
+            progress.inc()
+        progress.done()
+        self.writer.write()
+
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Remove Empty Sequences v1.0.2: Remove all the empty sequences in a list. 
[Category: Personal]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: sequence file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 (in case of pair end reads) [format: file in sequence format given by -f] [default: None]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in format given by -f]") + parser.add_option("-p", "--output2", dest="outputFileName2", action="store", default=None, type="string", help="output file 2 (in case of pair end reads) [format: output file in sequence format given by -f] [default: None]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]") + (options, args) = parser.parse_args() + + if options.log: + logHandle = open("%s.log" % options.outputFileName, "w") + + remover = EmptySequenceRemover(options.verbosity) + remover.setInputFileName(options.inputFileName, options.format) + remover.setOutputFileName(options.outputFileName) + remover.parse() + removedNames = remover.removedNames + if options.log: + for name in removedNames: + logHandle.write("Removed '%s' in %s\n" % (name, options.inputFileName)) + nbSequences = remover.parser.getNbSequences() + + newRemovedNames = {} + if options.inputFileName2 != None: + remover = EmptySequenceRemover(options.verbosity) + remover.setInputFileName(options.inputFileName2, options.format) + 
remover.setOutputFileName(options.outputFileName2)
+        remover.forbiddenNames = removedNames
+        remover.parse()
+        newRemovedNames = remover.removedNames
+        if options.log:
+            for name in newRemovedNames:
+                logHandle.write("Removed '%s' in %s\n" % (name, options.inputFileName2))
+        # rewrite the first output, this time also skipping the names found empty in the second file
+        remover = EmptySequenceRemover(options.verbosity)
+        remover.setInputFileName(options.inputFileName, options.format)
+        remover.setOutputFileName(options.outputFileName)
+        remover.forbiddenNames = newRemovedNames
+        remover.parse()
+
+    nbRemoved = len(removedNames.keys()) + len(newRemovedNames.keys())
+    print "%d over %d sequences are empty (%.2f%%)." % (nbRemoved, nbSequences, 0 if nbSequences == 0 else float(nbRemoved) / nbSequences * 100)  # an empty input reports 0% instead of raising ZeroDivisionError
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/removeExonLines.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/removeExonLines.sh Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,2 @@
+#!/bin/bash
+sed '/exon/d' "$1"
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/repetGffConverter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/repetGffConverter.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,71 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Convert a GFF with REPET format to BED format""" + +import os +from optparse import OptionParser +from commons.core.parsing.GffParser import * +from commons.core.writer.BedWriter import * +from SMART.Java.Python.misc.Progress import * + + +if __name__ == "__main__": + + # parse command line + description = "Repet GFF Convert v1.0.1: Convert REPET-flavored GFF to normal GFF. 
[Category: Personnal]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in GFF3 format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + parser = GffParser(options.inputFileName, options.verbosity) + transcripts = dict() + progress = Progress(parser.getNbTranscripts(), "Analyzing file %s" % (options.inputFileName), options.verbosity) + for transcript in parser.getIterator(): + if transcript.feature.endswith("range"): + transcripts[transcript.name] = transcript + elif transcript.feature.endswith("hsp"): + if transcript.name in transcripts: + transcripts[transcript.name].addExon(transcript) + else: + sys.exit("Transcript %s is not defined\n" % (transcript.name)) + else: + sys.exit("Do not understand feature %s" % (transcript.feature)) + progress.inc() + progress.done() + + writer = BedWriter(options.outputFileName, options.verbosity) + for name in transcripts: + writer.addTranscript(transcripts[name]) + + print "%d transcripts out of %d written (%.2f%%)" % (len(transcripts.keys()), parser.getNbTranscripts(), float(len(transcripts.keys())) / parser.getNbTranscripts() * 100) diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/restrictFromNucleotides.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/restrictFromNucleotides.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,78 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Remove all dirty sequences""" + +import os +import sys +from optparse import OptionParser +from commons.core.parsing.FastaParser import * +from commons.core.writer.FastaWriter import * +from commons.core.parsing.FastqParser import * +from commons.core.writer.FastqWriter import * +from SMART.Java.Python.misc.Progress import * +from SMART.Java.Python.misc.RPlotter import * + + +if __name__ == "__main__": + + # parse command line + description = "Restrict from nucleotide v1.0.1: Remove the sequences with ambiguous nucleotides. 
[Category: Personal]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
+    parser.add_option("-f", "--format", dest="format", action="store", default="fasta", type="string", help="format of the input and output files [compulsory] [format: sequence file format]")  # stored as options.format, which the dispatch below reads
+    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in sequence format given by -f]")
+    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
+    (options, args) = parser.parse_args()
+
+    # pick the parser / writer pair matching the requested sequence format
+    if options.format == "fasta":
+        parser = FastaParser(options.inputFileName, options.verbosity)
+        writer = FastaWriter(options.outputFileName, options.verbosity)
+    elif options.format == "fastq":
+        parser = FastqParser(options.inputFileName, options.verbosity)
+        writer = FastqWriter(options.outputFileName, options.verbosity)
+    else:
+        sys.exit("Do not understand '%s' format." 
% (options.format))
+    nbSequences = parser.getNbSequences()
+    print "sequences: %d" % (nbSequences)
+
+    progress = Progress(nbSequences, "Analyzing sequences of %s" % (options.inputFileName), options.verbosity)
+    nbKept = 0
+    for sequence in parser.getIterator():
+        if not sequence.containsAmbiguousNucleotides():  # keep only the sequences without ambiguous nucleotides
+            writer.addSequence(sequence)
+            nbKept += 1
+        progress.inc()
+    progress.done()
+    writer.write()  # NOTE(review): the sibling scripts call write() after their last add; assumed to be needed here too - confirm
+    print "%d items, %d kept (%.2f%%)" % (nbSequences, nbKept, 0 if nbSequences == 0 else float(nbKept) / nbSequences * 100)  # an empty input reports 0% instead of raising ZeroDivisionError
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/restrictFromSize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/restrictFromSize.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,94 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Get the size distribution of a Fasta / BED file""" + +import os +from optparse import OptionParser +from commons.core.parsing.FastaParser import * +from commons.core.parsing.FastqParser import * +from SMART.Java.Python.structure.TranscriptContainer import * +from commons.core.writer.TranscriptWriter import * +from commons.core.writer.FastaWriter import * +from commons.core.writer.FastqWriter import * +from SMART.Java.Python.misc.Progress import * +from SMART.Java.Python.misc.RPlotter import * + + +if __name__ == "__main__": + + # parse command line + description = "Restrict from Size v1.0.1: Select the elements of a list of sequences or transcripts with a given size. 
[Category: Data Selection]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: sequence or transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript or sequence format given by -f]") + parser.add_option("-m", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size [format: int]") + parser.add_option("-M", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size [format: int]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]") + (options, args) = parser.parse_args() + + if options.format == "fasta": + parser = FastaParser(options.inputFileName, options.verbosity) + writer = FastaWriter(options.outputFileName, options.verbosity) + elif options.format == "fastq": + parser = FastqParser(options.inputFileName, options.verbosity) + writer = FastqWriter(options.outputFileName, options.verbosity) + else: + parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity) + writer = TranscriptWriter(options.outputFileName, options.format, options.verbosity) + + + # treat items + nbItems = parser.getNbItems() + progress = Progress(nbItems, "Analyzing sequences of %s" % (options.inputFileName), options.verbosity) + nbKept = 0 + nbRead = 0 + nbClKept = 0 + nbClRead = 0 + for item in parser.getIterator(): + size = item.getSize() + nb = 1 if 
options.format in ("fasta", "fastq") or "nbElements" not in item.getTagNames() else float(item.getTagValue("nbElements"))
+        nbRead += nb
+        nbClRead += 1
+        if (options.minSize == None or options.minSize <= size) and (options.maxSize == None or options.maxSize >= size):
+            writer.addElement(item)
+            nbKept += nb
+            nbClKept += 1
+        progress.inc()
+    progress.done()
+
+    writer.write()
+
+    print "%d items, %d kept (%.2f%%)" % (nbRead, nbKept, 0 if nbRead == 0 else float(nbKept) / nbRead * 100)  # ratio over nbRead, the total printed on this line (nbItems only counts records)
+    if nbKept != nbClKept or nbRead != nbClRead:  # only differs when some record carried an "nbElements" tag
+        print "%d clusters, %d kept (%.2f%%)" % (nbClRead, nbClKept, 0 if nbClRead == 0 else float(nbClKept) / nbClRead * 100)
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/restrictSequenceList.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/restrictSequenceList.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,113 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Restrict a sequence list with some names""" + +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.writer.WriterChooser import WriterChooser +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc import Utils + +class RestrictSequenceList(object): + + def __init__(self, verbosity): + self.verbosity = verbosity + self.exclude = False + + def setInputFileName(self, fileName, format): + chooser = ParserChooser(self.verbosity) + chooser.findFormat(format) + self.parser = chooser.getParser(fileName) + + def setExclusion(self, boolean): + self.exclude = boolean + + def setOutputFileName(self, fileName, format): + chooser = WriterChooser(self.verbosity) + chooser.findFormat(format) + self.writer = chooser.getWriter(fileName) + + def setNamesFileName(self, fileName): + self.namesFileName = fileName + + def _readNames(self): + self.names = [] + handle = open(self.namesFileName) + for name in handle: + self.names.append(name.strip()) + handle.close() + + def _write(self): + nbElements = self.parser.getNbItems() + progress = Progress(nbElements, "Parsing input file", self.verbosity) + nbRead = 0 + nbWritten = 0 + for element in self.parser.getIterator(): + name = element.getName() + nbRead += 1 + if Utils.xor(name in self.names, self.exclude): + self.writer.addElement(element) + nbWritten += 1 + if name in self.names: + self.names.remove(name) + progress.inc() + progress.done() + if self.verbosity > 0: + print "%d read" % (nbRead) + print "%d written (%d%%)" % 
(nbWritten, 0 if nbRead == 0 else round(float(nbWritten) / nbRead * 100)) + + def run(self): + self._readNames() + self._write() + if self.names: + print "Some names are not present in the file: %s" % ", ".join(self.names) + + + +if __name__ == "__main__": + + description = "Restrict Sequence List v1.0.1: Keep the elements of a list of sequences whose name is mentionned in a given file. [Category: Data Selection]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFile", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", default="fasta", type="string", help="format of the input and output files [compulsory] [format: sequence file format] [default: fasta]") + parser.add_option("-n", "--name", dest="names", action="store", type="string", help="names of the transcripts [compulsory] [format: file in TXT format]") + parser.add_option("-o", "--output", dest="outputFile", action="store", type="string", help="output file [format: output file in sequence format given by -f]") + parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="output all those whose name is NOT on the list [format: boolean]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + rsl = RestrictSequenceList(options.verbosity) + rsl.setInputFileName(options.inputFile, options.format) + rsl.setOutputFileName(options.outputFile, options.format) + rsl.setNamesFileName(options.names) + rsl.setExclusion(options.exclude) + rsl.run() diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/restrictTranscriptList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/restrictTranscriptList.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,85 @@ +#! 
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +"""Restrict a transcript list with some parameters (regions)""" + +from optparse import OptionParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from commons.core.writer.TranscriptWriter import TranscriptWriter +from SMART.Java.Python.misc.Progress import Progress + +STRAND2DIRECTION = {"+": 1, "-": -1, None: None} + +if __name__ == "__main__": + + # parse command line + description = "Restrict Transcript List v1.0.2: Keep the coordinates which are located in a given position. [Category: Data Selection]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format [compulsory] [format: transcript file format]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="chromosome [format: string]") + parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start [format: int]") + parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end [format: int]") + parser.add_option("-t", "--strand", dest="strand", action="store", default=None, type="string", help="strand (+ or -) [format: string]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity) + writer = TranscriptWriter(options.outputFileName, options.format, options.verbosity) + + direction = STRAND2DIRECTION[options.strand] + + 
nbTranscripts = parser.getNbTranscripts()
+    progress = Progress(nbTranscripts, "Parsing file %s" % (options.inputFileName), options.verbosity)
+
+    nbTotal = 0
+    nbKept = 0
+    for transcript in parser.getIterator():
+        progress.inc()
+        nbTotal += 1
+        if options.chromosome != None and options.chromosome != transcript.getChromosome():
+            continue
+        if options.start != None and options.start > transcript.getEnd():  # transcript ends before the requested start
+            continue
+        if options.end != None and options.end < transcript.getStart():  # transcript starts after the requested end
+            continue
+        if direction != None and direction != transcript.getDirection():
+            continue
+        nbKept += 1
+        writer.addTranscript(transcript)
+    progress.done()
+
+    writer.write()
+
+    # an input without any transcript reports 0% instead of raising ZeroDivisionError
+    percentKept = 0 if nbTotal == 0 else float(nbKept) / nbTotal * 100
+    print "%d out of %d are kept (%f%%)" % (nbKept, nbTotal, percentKept)
diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/runRandomJobs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/runRandomJobs.py Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,46 @@
+import unittest
+import os
+import time
+from optparse import OptionParser
+from SMART.Java.Python.ncList.test.MockFindOverlaps_randomExample import MockFindOverlaps_randomExample
+from SMART.Java.Python.FindOverlapsOptim import FindOverlapsOptim
+
+if __name__ == '__main__':
+    description = "runRandomJobs: create random ref/query files (with size given), and run the jobs on cluster with help of runJobs.sh"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--inputRef", dest="inputRefGff3FileName", action="store", type="string", help="Reference input file [compulsory] [format: file in gff3 format]")
+    parser.add_option("-j", "--inputQuery", dest="inputQueryGff3FileName", action="store", type="string", help="Query input file [compulsory] [format: file in gff3 format]")
+    parser.add_option("-m", "--inputRefSize", dest="numberOfRefReads", action="store", type="int", help="The number of Reference")
+    
parser.add_option("-n", "--inputQuerySize", dest="numberOfQReads", action="store", type="int", help="The number of Query")
+    parser.add_option("-o", "--output", dest="outputGff3FileName", action="store", type="string", help="output file [compulsory] [format: output file in gff3 format]")
+    (options, args) = parser.parse_args()
+
+    outputDataName = 'timeResult.dat'
+    fTime = open(outputDataName, 'w')
+    fTime.write('NbRef\tNbQuery\tNbOverlap\ttime\n')
+    chromSize = 100000
+    print 'ref size = %d, query size = %d' %(options.numberOfRefReads, options.numberOfQReads)
+    iMFOR_ref = MockFindOverlaps_randomExample(options.inputRefGff3FileName, 'ref', options.numberOfRefReads, chromSize)
+    iMFOR_ref.write()
+    cmd_ref = 'sort -f -n -k4 -k5.4rn -o %s %s' % (options.inputRefGff3FileName, options.inputRefGff3FileName)  # sort the generated reference file in place
+    os.system(cmd_ref)
+    iMFOR_query = MockFindOverlaps_randomExample(options.inputQueryGff3FileName,'q', options.numberOfQReads, chromSize)
+    iMFOR_query.write()
+    cmd_query = 'sort -f -n -k4 -k5.4rn -o %s %s' % (options.inputQueryGff3FileName, options.inputQueryGff3FileName)  # sort the generated query file in place
+    os.system(cmd_query)
+    iFOO = FindOverlapsOptim(options.inputRefGff3FileName, options.inputQueryGff3FileName)  # the class imported at the top of the script; NOTE(review): assumed to take (ref, query) file names - confirm
+    iFOO.setOutputGff3FileName(options.outputGff3FileName)
+
+    startTime_optim = time.time()
+    iFOO.run()
+    iFOO.close()
+    nbOverlap = iFOO.getNbOverlap()
+    endTime_optim = time.time()  # the timed span covers run(), close() and getNbOverlap(), not the sort below
+    cmd = 'sort -f -n -k4 -k5.4rn -k9.5 -t ";" -o %s %s' % (options.outputGff3FileName, options.outputGff3FileName)
+    os.system(cmd)
+    totalTime_optim = endTime_optim - startTime_optim
+    print 'we take %s second.' 
% (totalTime_optim) + fTime.write('%d\t%d\t%d\t%.2f\n'%(options.numberOfRefReads, options.numberOfQReads, nbOverlap, totalTime_optim)) + iFOO.deletIntermediateFiles() + fTime.close() diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/selectByNbOccurrences.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/selectByNbOccurrences.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,89 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""
Select the transcripts whose name occurs at most a given number of times.

The input is read twice: a first pass counts how many transcripts carry each
name, a second pass writes every transcript whose name count is not greater
than the -n threshold to a GFF3 file (and, with -y, to a MySQL writer).
"""

import os
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import *
from commons.core.writer.Gff3Writer import *
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.misc.RPlotter import *


if __name__ == "__main__":

    # parse command line
    description = "Select by # of Occurrences v1.0.1: Keep the reads which have mapped less than a given number of times. [Category: Personnal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]")
    parser.add_option("-n", "--occurrences", dest="occurrences", action="store", default=1, type="int", help="maximum number of occurrences allowed [format: int] [default: 1]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-y", "--mysql", dest="mysql", action="store_true", default=False, help="mySQL output [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    # The container gets its own name so that the option parser is not shadowed.
    transcriptContainer = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    nbTranscripts = transcriptContainer.getNbTranscripts()

    # first pass: count how many transcripts carry each name
    names = dict()
    progress = Progress(nbTranscripts, "Reading names of %s" % (options.inputFileName), options.verbosity)
    for transcript in transcriptContainer.getIterator():
        names[transcript.name] = names.get(transcript.name, 0) + 1
        progress.inc()
    progress.done()

    # second pass: write the transcripts whose name is rare enough
    nbWritten = 0
    writer = Gff3Writer(options.outputFileName, options.verbosity)
    mysqlWriter = None
    if options.mysql:
        # NOTE(review): MySqlTranscriptWriter is not imported by name here; presumably
        # one of the star imports above provides it -- confirm.
        mysqlWriter = MySqlTranscriptWriter(options.outputFileName, options.verbosity)
    progress = Progress(nbTranscripts, "Writing transcripts", options.verbosity)
    for transcript in transcriptContainer.getIterator():
        if names[transcript.name] <= options.occurrences:
            nbWritten += 1
            writer.addTranscript(transcript)
            if mysqlWriter is not None:
                mysqlWriter.addTranscript(transcript)
        progress.inc()
    progress.done()

    if mysqlWriter is not None:
        mysqlWriter.write()

    # An empty input used to raise ZeroDivisionError here; report 0% instead.
    if nbTranscripts:
        percentage = float(nbWritten) / nbTranscripts * 100
    else:
        percentage = 0.0
    print("%d input" % (nbTranscripts))
    print("%d output (%.2f%%)" % (nbWritten, percentage))

# ---- patch text carried on the same source lines (header of the next file in the changeset) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/sequenceListSplitter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/sequenceListSplitter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,73 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Split a FASTA file into several shorter ones.

The sequences are distributed, in input order, over files named
<output><k>.fasta (k = 1, 2, ...), each holding ceil(nbSequences / number)
sequences, so that at most -n files are written.
"""

from optparse import OptionParser
from commons.core.parsing.SequenceListParser import *
from commons.core.writer.FastaWriter import *
from SMART.Java.Python.misc.Progress import *


if __name__ == "__main__":

    # parse command line
    description = "Sequence List Splitter v1.0.1: Split a list of big sequences into small chunks. [Category: Personnal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in FASTA format]")
    parser.add_option("-o", "--output", dest="outputFileNames", action="store", type="string", help="output files [compulsory] [format: output file in FASTA format]")
    parser.add_option("-n", "--number", dest="number", action="store", default=10, type="int", help="number of splits [compulsory] [format: int] [default: 10]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    # Fail with a usage message instead of a traceback further down.
    if options.inputFileName is None or options.outputFileNames is None:
        parser.error("options -i and -o are compulsory")
    if options.number < 1:
        parser.error("option -n must be a positive integer")

    logHandle = None
    if options.log:
        logHandle = open(options.outputFileNames + ".log", "w")

    try:
        # split file
        sequenceListParser = SequenceListParser(options.inputFileName, options.verbosity)
        nbSequences = sequenceListParser.getNbSequences()
        # Integer ceiling: the former math.ceil(a / b) floored first under Python 2
        # (and math was never imported by name), which could give 0 or too small
        # a chunk size.
        nbSequencesByFile = max(1, (nbSequences + options.number - 1) // options.number)

        # write into files
        currentFileNumber = 1
        writer = FastaWriter("%s%i.fasta" % (options.outputFileNames, currentFileNumber), options.verbosity)
        nbSequencesHere = 0
        progress = Progress(nbSequences, "Writing files", options.verbosity)
        for sequence in sequenceListParser.getIterator():
            # Roll over only when another sequence actually arrives, so that no
            # empty trailing file is opened after the last full chunk.
            if nbSequencesHere == nbSequencesByFile:
                currentFileNumber += 1
                writer = FastaWriter("%s%i.fasta" % (options.outputFileNames, currentFileNumber), options.verbosity)
                nbSequencesHere = 0
            writer.addSequence(sequence)
            nbSequencesHere += 1
            progress.inc()
        progress.done()
    finally:
        if logHandle is not None:
            logHandle.close()

# ---- patch text carried on the same source lines (header of the next file in the changeset) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/splitByTag.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/splitByTag.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,68 @@
#!
# /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Read a file and split it into several, depending on a tag.

Every transcript is appended to <output>/<Value>.gff3, where <Value> is the
transcript's value for the -t tag (spaces replaced by underscores, title-cased),
or "Notag" when the transcript does not carry the tag.
"""

import os
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import *
from commons.core.writer.Gff3Writer import *
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.misc import Utils


if __name__ == "__main__":

    # parse command line
    description = "Split By Tag v1.0.1: Read a file and split it into several, depending on a tag. [Category: Personnal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-t", "--tag", dest="tag", action="store", type="string", help="tag on which the split is made [compulsory] [format: string]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in CSV format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Without these the script used to die with a traceback inside os.path.join
    # or silently put every transcript in the "Notag" file.
    if None in (options.inputFileName, options.format, options.tag, options.outputFileName):
        parser.error("options -i, -f, -t and -o are required")

    # -o is used as a directory (one GFF3 file per tag value): make sure it exists.
    if not os.path.isdir(options.outputFileName):
        os.makedirs(options.outputFileName)

    transcriptContainer = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    writers = dict()

    progress = Progress(transcriptContainer.getNbTranscripts(), "Reading file %s" % (options.inputFileName), options.verbosity)
    for transcript in transcriptContainer.getIterator():
        value = transcript.getTagValue(options.tag)
        if value is None:
            value = "noTag"
        value = str(value).replace(" ", "_").title()
        # The value becomes a file name: keep it inside the output directory.
        for separator in (os.sep, os.altsep):
            if separator:
                value = value.replace(separator, "_")
        if value not in writers:
            writers[value] = Gff3Writer("%s.gff3" % (os.path.join(options.outputFileName, value)))
        writers[value].addTranscript(transcript)

        progress.inc()
    progress.done()

# ---- patch text carried on the same source lines (header of the next file in the changeset) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/splitMultiFasta.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/splitMultiFasta.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,64 @@
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
# You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Split a Multi-Fasta file to several Fasta files.

Each record (a ">" header line and the lines that follow it) is written to
<output><name>.fasta, where <name> is the header text up to its first space.
"""

import os
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import *
from commons.core.writer.Gff3Writer import *
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.misc import Utils


if __name__ == "__main__":

    # parse command line
    description = "Split Multi-Fasta v1.0.1: Split a Multi-Fasta file to several Fasta files. [Category: Personnal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file 1 [compulsory] [format: file in FASTA format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in FASTA format]")
    (options, args) = parser.parse_args()

    inputHandle = open(options.inputFileName)
    outputHandle = None

    # try/finally releases both handles even when a line cannot be processed.
    try:
        for line in inputHandle:
            line = line.strip()
            if not line:
                # blank lines used to raise IndexError on line[0]
                continue
            if line.startswith(">"):
                if outputHandle is not None:
                    outputHandle.close()
                name = line[1:].split(" ")[0]
                outputHandle = open("%s%s.fasta" % (options.outputFileName, name), "w")
            elif outputHandle is None:
                # sequence data before any header: there is no file to write it to
                raise Exception("File '%s' does not start with a FASTA header ('>')." % (options.inputFileName))
            outputHandle.write("%s\n" % (line))
    finally:
        inputHandle.close()
        # stays None when the input holds no record at all
        if outputHandle is not None:
            outputHandle.close()

# ---- patch text carried on the same source lines (header of the next file in the changeset) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/structure/Bins.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/structure/Bins.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,77 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Some functions about bins

A bin id is C{level * 10 ** (getMaxBin() + 1) + index}: C{level} is the exponent
of the bin size (bins of level C{i} span C{10 ** i} positions) and C{index} is
the position divided by that size. One extra bin, of level C{getMaxBin() + 1}
and index 0, holds whatever fits in no smaller bin.

All divisions are floor divisions (C{//}), so that the results are the same
integers under Python 2 and Python 3.
"""

def getMinBin():
    """
    Get the exponent of the smallest bin size
    """
    return 3


def getMaxBin():
    """
    Get the exponent of the largest regular bin size
    """
    return 7


def getBin(start, end):
    """
    Get the smallest bin which contains both ends of an interval
    @param start: start point of the interval
    @type  start: int
    @param end:   end point of the interval
    @type  end:   int
    @return:      the id of the bin (the extra, biggest bin if no level fits)
    """
    levelFactor = 10 ** (getMaxBin() + 1)
    for i in range(getMinBin(), getMaxBin() + 1):
        binLevel = 10 ** i
        startIndex = int(start // binLevel)
        if startIndex == int(end // binLevel):
            return int(i * levelFactor + startIndex)
    return int((getMaxBin() + 1) * levelFactor)


def getOverlappingBins(start, end):
    """
    Get, for each level, the range of bins an interval overlaps
    @param start: start point of the interval
    @type  start: int
    @param end:   end point of the interval
    @type  end:   int
    @return:      a list of (first bin id, last bin id), one per level, followed
                  by the extra, biggest bin
    """
    levelFactor = 10 ** (getMaxBin() + 1)
    bigBin = int((getMaxBin() + 1) * levelFactor)
    array = []
    for i in range(getMinBin(), getMaxBin() + 1):
        binLevel = 10 ** i
        array.append((int(i * levelFactor + int(start // binLevel)), int(i * levelFactor + int(end // binLevel))))
    array.append((bigBin, bigBin))
    return array


def getIterator(maxValue = None):
    """
    Iterate over the ids of all the bins up to a given position
    @param maxValue: greatest position to cover [default: 10 ** (getMaxBin() + getMinBin()) - 1]
    @type  maxValue: int
    """
    if maxValue is None:
        maxValue = 10 ** (getMaxBin() + getMinBin()) - 1
    for i in range(getMinBin(), getMaxBin() + 1):
        binLevel = 10 ** i
        binBit = i * 10 ** (getMaxBin() + 1)
        # floor division: range() needs an int under Python 3 too
        for j in range(0, maxValue // binLevel + 1):
            yield binBit + j
    yield int((getMaxBin() + 1) * 10 ** (getMaxBin() + 1))


def getNbBins(maxValue = None):
    """
    Get a number of bins up to a given position
    @param maxValue: greatest position to cover [default: 10 ** (getMaxBin() + getMinBin()) - 1]
    @type  maxValue: int
    @return:         sum over the levels of C{maxValue // 10 ** level}, plus one
    """
    # NOTE(review): getIterator() yields maxValue // 10 ** i + 1 ids per level, so
    # this count is one per level smaller than the number of ids it produces; the
    # value is left as it was -- confirm which one is intended.
    if maxValue is None:
        maxValue = 10 ** (getMaxBin() + getMinBin()) - 1
    nbBins = 0
    for i in range(getMinBin(), getMaxBin() + 1):
        nbBins += maxValue // 10 ** i
    return nbBins + 1

# ---- patch text carried on the same source lines (header of the next file in the changeset) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/structure/Interval.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/SMART/Java/Python/structure/Interval.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,706 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
+# + +from SMART.Java.Python.structure.Bins import * +from commons.core.coord.Range import Range + +class Interval(Range): + """ + Store a genomic interval + @ivar name: name of the interval [optional] + @type name: string + @ivar id: id of the interval [optional] + @type id: int + @ivar bin: bin in which the interval should be if stored in a database [computed] + @type bin: int + @ival tags: information about the transcript [optional] + @type tags: dict + @ivar verbosity: verbosity + @type verbosity: int [default: 0] + """ + + def __init__(self, interval = None, verbosity = 0): + """ + Constructor + @param interval: interval to be copied + @type interval: class L{Interval} + @param verbosity: verbosity + @type verbosity: int + """ + Range.__init__(self) + self.name = None + self.id = None + self.bin = None + self.verbosity = verbosity + self.tags = {} + if interval != None: + self.copy(interval) + + #!!!! Warning: two methods getStart() and getEnd() give the information maximum and minimum in interval.!!!!# + #In case strand = "+", start < end; strand = "-", start > end + def getStart(self): + if self.start == -1: + return -1 + if self.end == -1: + return self.start + return self.getMin() + + + def getEnd(self): + if self.end == -1: + return -1 + if self.start == -1: + return self.end + return self.getMax() + + + def getChromosome(self): + return self.getSeqname() + + + def getDirection(self): + return 1 if self.getStrand() == "+" else -1 + + + def getName(self): + return self.name + + + def isSet(self): + """ + Check if the interval is set + """ + return self.getStart() == None and self.getEnd() == None + + + def copy(self, interval): + """ + Copy method + @param interval: interval to be copied + @type interval: class L{Interval} + """ + self.setStart(interval.getStart()) + self.setEnd(interval.getEnd()) + self.setChromosome(interval.getChromosome()) + self.setDirection(interval.getDirection()) + self.name = interval.name + self.id = interval.id + self.bin = 
interval.bin + self.tags = {} + for tag in interval.tags: + self.tags[tag] = interval.tags[tag] + self.verbosity = interval.verbosity + + + def setName(self, name): + """ + Set the name + @param name: name of the interval + @type name: string + """ + if len(name) > 100: + name = name[:100] + self.name = name + + + def setChromosome(self, chromosome=""): + """ + Set the chromosome + @param chromosome: chromosome on which the interval is + @type chromosome: string + """ + if not chromosome: + self.seqname = None + else: + self.seqname = chromosome.replace(".", "_").replace("|", "_") + + + def setStart(self, start): + """ + Set the start point + Possibly reset bin + @param start: start point of the interval + @type start: int + """ + self.bin = None + direction = self.getDirection() + if self.start == -1: + self.start = start + elif self.end == -1: + self.end = start + else: + if direction == 1: + self.start = start + else: + self.end = start + if direction == 1: + self.start, self.end = min(self.start, self.end), max(self.start, self.end) + else: + self.start, self.end = max(self.start, self.end), min(self.start, self.end) + + + def setEnd(self, end): + """ + Set the end point + Possibly reset bin + @param end: end point of the interval of the interval + @type end: int + """ + self.bin = None + direction = self.getDirection() + if self.end == -1: + self.end = end + elif self.start == -1: + self.start = end + else: + if direction == 1: + self.end = end + else: + self.start = end + if direction == 1: + self.start, self.end = min(self.start, self.end), max(self.start, self.end) + else: + self.start, self.end = max(self.start, self.end), min(self.start, self.end) + + + def setSize(self, size): + """ + Possibly modify the end point + @param size: size of the transcript + @type size: int + """ + if self.end == None and self.start != None: + self.setEnd(self.start + self.getSize() - 1) + elif self.start == None and self.end != None: + self.setStart(self.end - self.getSize() 
+ 1) + + + def getSize(self): + """ + Get the size + """ + return self.getEnd() - self.getStart() + 1 + + + def _setDirection(self, direction): + """ + Set the direction of the interval (connection to Range) + @param direction: direction of the transcript (+ / -) + @type direction: int (1 or -1) + """ + if direction * self.getDirection() < 0: + self.reverse() + + + def setDirection(self, direction): + """ + Set the direction of the interval + Possibly parse different formats + @param direction: direction of the transcript (+ / -) + @type direction: int or string + """ + if type(direction).__name__ == 'int': + self._setDirection(direction / abs(direction)) + elif type(direction).__name__ == 'str': + if direction == "+": + self._setDirection(1) + elif direction == "-": + self._setDirection(-1) + elif direction == "1" or direction == "-1": + self._setDirection(int(direction)) + elif direction.lower() == "plus": + self._setDirection(1) + elif direction.lower() == "minus": + self._setDirection(-1) + else: + raise Exception("Cannot understand direction %s" % (direction)) + else: + raise Exception("Cannot understand direction %s" % (direction)) + + + def extendStart(self, size): + """ + Extend the interval by the 5' end + @param size: the size to be exended + @type size: int + """ + if self.getDirection() == 1: + self.setStart(max(0, self.getStart() - size)) + else: + self.setEnd(self.getEnd() + size) + self.bin = None + + + def extendEnd(self, size): + """ + Extend the interval by the 3' end + @param size: the size to be exended + @type size: int + """ + if self.getDirection() == 1: + self.setEnd(self.getEnd() + size) + else: + self.setStart(max(0, self.getStart() - size)) + self.bin = None + + + def restrictStart(self, size = 1): + """ + Restrict the interval by some nucleotides, start from its start position + Remove the exons + @param size: the size to be restricted to + @type size: int + """ + if self.getDirection() == 1: + self.setEnd(min(self.getEnd(), 
self.getStart() + size - 1)) + else: + self.setStart(max(self.getStart(), self.getEnd() - size + 1)) + self.bin = None + + + def restrictEnd(self, size = 1): + """ + Restrict the interval by some nucleotides, end from its end position + Remove the exons + @param size: the size to be restricted to + @type size: int + """ + if self.getDirection() == 1: + self.setStart(max(self.getStart(), self.getEnd() - size + 1)) + else: + self.setEnd(min(self.getEnd(), self.getStart() + size - 1)) + self.bin = None + + + + def setTagValue(self, name, value): + """ + Set a tag + @param name: name of the tag + @type name: string + @param value: value of the tag + @type value: int or string + """ + self.tags[name] = value + + + def getTagNames(self): + """ + Get all the names of the tags + """ + return self.tags.keys() + + + def getTagValue(self, tag): + """ + Get the value of a tag + @param tag: name of a tag + @type tag: string + """ + if tag not in self.tags: + return None + return self.tags[tag] + + + def getTagValues(self, tagSep = "; ", fieldSep = " ", surrounder = ""): + """ + Get the formatted tag values + @param tagSep: separator between tags + @type tagSep: string + @param fieldSep: separator between tag name and tag value + @type fieldSep: string + @param surrounder: string which optionally surround values + @type surrounder: string + """ + tags = [] + for name, value in self.tags.iteritems(): + if value == None: + continue + if isinstance(value, basestring): + tags.append("%s%s%s%s%s" % (name, fieldSep, surrounder, value.replace("'", "\\'"), surrounder)) + elif type(value) is int: + tags.append("%s%s%s%i%s" % (name, fieldSep, surrounder, value, surrounder)) + elif type(value) is float: + tags.append("%s%s%s%f%s" % (name, fieldSep, surrounder, value, surrounder)) + else: + raise Exception("Do not know how to print '" + value + "'.") + if self.getName() != None: + tags.append("%s%s%s%s%s" % ("Name", fieldSep, surrounder, self.getName(), surrounder)) + return 
tagSep.join(tags) + + + def setTagValues(self, tags, tagSep = "; ", fieldSep = " "): + """ + Set the tag values using given string + @param tags: the tags, concatenated + @type tags: string + @param tagSep: separator between tags + @type tagSep: string + @param fieldSep: separator between tag name and tag value + @type fieldSep: string + """ + if tags == "": + self.tags = {} + return + for splittedTag in tags.split(tagSep): + if fieldSep not in splittedTag: + raise Exception("Weird field '%s' in tags '%s'" % (splittedTag, tags)) + tag, value = splittedTag.split(fieldSep, 1) + if tag == "Name": + self.setName(value) + continue + try: + intValue = int(value) + self.tags[tag] = intValue + except ValueError: + try: + floatValue = float(value) + self.tags[tag] = floatValue + except ValueError: + self.tags[tag] = value + + + def deleteTag(self, tag): + """ + Remove a tag + @param tag: the tag to be removed + @type tag: string + """ + if tag in self.tags: + del self.tags[tag] + + + def setNbOccurrences(self, nbOccurrences): + """ + Set the number of occurrences of the interval + @param nbOccurrences: number of occurrences of the interval + @type nbOccurrences: int + """ + self.setTagValue("nbOccurrences", nbOccurrences) + + + def setOccurrence(self, occurrence): + """ + Set the occurrence of this interval + @param occurrence: an occurrence for this transcript + @type occurrence: int + """ + self.setTagValue("occurrence", occurrence) + + def __eq__(self, interval): + """ + Whether two intervals are equal (start and end at same position) + @param interval: object to be compared to + @type interval: class L{Interval} + """ + if not interval: + return False + return self.getChromosome() == interval.getChromosome() and self.getStart() == interval.getStart() and self.getEnd() == interval.getEnd() and self.getDirection() == interval.getDirection() + + + def overlapWith(self, interval, nbNucleotides = 1): + """ + Whether two intervals overlap + @param interval: object to be 
compared to + @type interval: class L{Interval} + @param nbNucleotides: minimum number of nucleotides to declare and overlap + @type nbNucleotides: int + """ + if self.getChromosome() != interval.getChromosome(): + return False + return (min(self.getEnd(), interval.getEnd()) - max(self.getStart(), interval.getStart()) + 1 >= nbNucleotides) + + def isIncludeIn(self, interval): + return interval.include(self) + + + def include(self, interval): + """ + Whether this interval includes the other one + @param interval: object to be compared to + @type interval: class L{Interval} + """ + if self.getChromosome() != interval.getChromosome(): + return False + return ((self.getStart() <= interval.getStart()) and (self.getEnd() >= interval.getEnd())) + + + def getDifference(self, interval, sameStrand = False): + """ + Get the difference between this cluster and another one + @param interval: object to be compared to + @type interval: class L{Interval} + @param sameStrand: do the comparison iff the intervals are on the same strand + @type sameStrand: boolean + @return: a (possibly empty) list of intervals + """ + newInterval = Interval() + newInterval.copy(self) + if self.getChromosome() != interval.getChromosome(): + return [newInterval] + if not self.overlapWith(interval): + return [newInterval] + if sameStrand and self.getDirection() != interval.getDirection(): + return [newInterval] + intervals = [] + if self.getStart() < interval.getStart(): + newInterval = Interval() + newInterval.copy(self) + newInterval.setEnd(min(self.getEnd(), interval.getStart() - 1)) + intervals.append(newInterval) + if self.getEnd() > interval.getEnd(): + newInterval = Interval() + newInterval.copy(self) + newInterval.setStart(max(self.getStart(), interval.getEnd() + 1)) + intervals.append(newInterval) + return intervals + + + def getIntersection(self, interval): + """ + Get the intersection between this interval and another one + @param interval: object to be compared to + @type interval: class 
L{Interval} + @return: an other interval + """ + if not self.overlapWith(interval): + return None + newInterval = Interval() + newInterval.setChromosome(self.getChromosome()) + newInterval.setDirection(self.getDirection()) + newInterval.setName("%s_intersect_%s" % (self.getName(), interval.getName())) + newInterval.setStart(max(self.getStart(), interval.getStart())) + newInterval.setEnd(min(self.getEnd(), interval.getEnd())) + return newInterval + + + def getDistance(self, interval): + """ + Get the distance between two intervals (a non-negative value) + @param interval: another interval + @type interval: class L{Interval} + """ + if self.overlapWith(interval): + return 0 + if self.getChromosome() != interval.getChromosome(): + raise Exception("Cannot get the distance between %s and %s" % (str(self), str(interval))) + return min(abs(self.getStart() - interval.getEnd()), abs(self.getEnd() - interval.getStart())) + + + def getRelativeDistance(self, interval): + """ + Get the distance between two intervals (negative if first interval is before) + @param interval: another interval + @type interval: class L{Interval} + """ + if self.overlapWith(interval): + return 0 + if self.getChromosome() != interval.getChromosome(): + raise Exception("Cannot get the distance between %s and %s" % (str(self), str(interval))) + if self.getEnd() < interval.getStart(): + distance = interval.getStart() - self.getEnd() + else: + distance = interval.getEnd() - self.getStart() + distance *= self.getDirection() + return distance + + + def merge(self, interval, normalization = False): + """ + Merge two intervals + @param interval: another interval + @type interval: class L{Interval} + @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements + @type normalization: boolean + """ + if self.getChromosome() != interval.getChromosome(): + raise Exception("Cannot merge '%s' and '%s' for they are on different chromosomes." 
% (str(self), str(interval))) + direction = None + if self.getStart() == self.getEnd(): + direction = interval.getDirection() + elif interval.getStart() == interval.getEnd(): + direction = self.getDirection() + elif self.getDirection() != interval.getDirection(): + raise Exception("Cannot merge '%s' and '%s' for they are on different strands." % (str(self), str(interval))) + self.setStart(min(self.getStart(), interval.getStart())) + self.setEnd(max(self.getEnd(), interval.getEnd())) + if direction != None: + self.setDirection(direction) + nbElements = 0.0 + for element in (self, interval): + for tagName in ("nbElements", "nbOccurrences"): + if tagName not in element.getTagNames(): + element.setTagValue(tagName, 1) + nbElements += float(element.getTagValue("nbElements")) / float(element.getTagValue("nbOccurrences")) if normalization else float(element.getTagValue("nbElements")) + self.setTagValue("nbElements", nbElements) + self.bin = None + for tagName in ("identity", "nbOccurrences", "occurrence", "nbMismatches", "nbGaps", "rank", "evalue", "bestRegion"): + if tagName in self.getTagNames(): + del self.tags[tagName] + + + def getBin(self): + """ + Get the bin of the interval + Computed on the fly + """ + if self.bin == None: + self.bin = getBin(self.getStart(), self.getEnd()) + return self.bin + + + def getBins(self): + """ + Get all the bin this interval could fall into + """ + return getOverlappingBins(self.getStart(), self.getEnd()) + + + def getSqlVariables(cls): + """ + Get the properties of the object that should be saved in a database + """ + variables = ["name", "chromosome", "start", "end", "direction", "tags", "bin"] + return variables + getSqlVariables = classmethod(getSqlVariables) + + + def setSqlValues(self, array): + """ + Set the values of the properties of this object as given by a results line of a SQL query + """ + self.id = array[0] + self.name = array[1].strip("'") + self.setChromosome(array[2].strip("'")) + self.setStart(array[3]) + 
self.setEnd(array[4]) + self.setDirection(array[5]) + self.setTagValues(array[6].strip("'"), ";", "=") + self.bin = array[7] + + + def getSqlValues(self): + """ + Get the values of the properties that should be saved in a database + """ + values = dict() + values["name"] = self.name + values["chromosome"] = self.getChromosome() + values["start"] = self.getStart() + values["end"] = self.getEnd() + values["direction"] = self.getDirection() + values["tags"] = self.getTagValues(";", "=") + values["bin"] = self.getBin() + return values + + + def getSqlTypes(cls): + """ + Get the values of the properties that should be saved in a database + """ + types = dict() + types["name"] = "varchar" + types["chromosome"] = "varchar" + types["start"] = "int" + types["end"] = "int" + types["direction"] = "tinyint" + types["tags"] = "varchar" + types["bin"] = "int" + return types + getSqlTypes = classmethod(getSqlTypes) + + + def getSqlSizes(cls): + """ + Get the sizes of the properties that should be saved in a database + """ + sizes = dict() + sizes["name"] = 255 + sizes["chromosome"] = 255 + sizes["start"] = 11 + sizes["end"] = 11 + sizes["direction"] = 4 + sizes["tags"] = 1023 + sizes["bin"] = 11 + return sizes + getSqlSizes = classmethod(getSqlSizes) + + + def printCoordinates(self): + """ + Print start and end positions (depending on the direction of the interval) + """ + if self.getDirection() == 1: + return "%d-%d" % (self.getStart(), self.getEnd()) + else: + return "%d-%d" % (self.getEnd(), self.getStart()) + + + def extractSequence(self, parser): + """ + Get the sequence corresponding to this interval + @param parser: a parser to a FASTA file + @type parser: class L{SequenceListParser} + @return : a instance of L{Sequence} + """ + return parser.getSubSequence(self.getChromosome(), self.getStart(), self.getEnd(), self.getDirection(), self.name) + + + def extractWigData(self, parser): + """ + Get the data retrieved from a wig file + @param parser: a parser class to a WIG file 
+ @type parser: class L{WigParser} + """ + data = parser.getRange(self.getChromosome(), self.getStart(), self.getEnd()) + if self.getDirection() == -1: + if parser.strands: + newData = {} + for strand in data: + data[strand].reverse() + newData[-strand] = data[strand] + data = newData + else: + data.reverse() + return data + + + def __str__(self): + """ + Output a simple representation of this interval + """ + direction = "+" + if self.getDirection() == -1: + direction = "-" + string = "%s:%d-%d (%s)" % (self.getChromosome(), self.getStart(), self.getEnd(), direction) + if self.name != "": + string = "(%s) %s" % (self.name, string) + return string + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/structure/Mapping.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/Mapping.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,255 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
class Mapping(object):
    """
    A class that represents a mapping
    @ivar targetInterval: the region covered on the target
    @type targetInterval: class L{Interval}
    @ivar queryInterval:  the region covered on the query
    @type queryInterval:  class L{Interval}
    @ivar subMappings:    the parts of the mapping
    @type subMappings:    list of L{SubMapping}
    @ivar size:           the sum of the sizes of the sub-mappings
    @type size:           int
    @ivar transcript:     cached result of L{getTranscript}
    @type transcript:     class L{Transcript}
    @ivar tags:           various information
    @type tags:           dict
    """

    def __init__(self):
        self.targetInterval = None
        self.queryInterval  = None
        self.subMappings    = []
        self.size           = None
        self.transcript     = None
        self.tags           = {}


    def _getLabel(self):
        """
        Get a printable label for error messages
        It must not rely on the sub-mappings: __str__ needs them, so using
        str(self) in the "no submapping" errors would raise again.
        @return: the name of the query interval, or a placeholder
        """
        if self.queryInterval is not None:
            return self.queryInterval.getName()
        return "<no query>"


    def copy(self, mapping):
        """
        Copy another mapping into this one
        @param mapping: the mapping to be copied
        @type  mapping: class L{Mapping}
        """
        for subMapping in mapping.subMappings:
            newSubMapping = SubMapping(subMapping)
            self.addSubMapping(newSubMapping)
        self.targetInterval = Interval(mapping.targetInterval)
        self.queryInterval  = Interval(mapping.queryInterval)
        self.size           = mapping.size
        # The tags are stored in the "tags" dict: a Mapping is not subscriptable
        self.tags           = dict(mapping.tags)
        # This mapping may have no transcript yet: build one before copying
        if mapping.transcript is None:
            self.transcript = None
        else:
            self.transcript = Transcript()
            self.transcript.copy(mapping.transcript)


    def setTargetInterval(self, interval):
        """
        Set the target interval (a copy is stored)
        @param interval: the target interval
        @type  interval: class L{Interval}
        """
        self.targetInterval = Interval(interval)
        if self.queryInterval is not None:
            self.setDirection(self.targetInterval.getDirection() * self.queryInterval.getDirection())


    def setQueryInterval(self, interval):
        """
        Set the query interval (a copy is stored)
        @param interval: the query interval
        @type  interval: class L{Interval}
        """
        self.queryInterval = Interval(interval)
        if self.targetInterval is not None:
            self.setDirection(self.targetInterval.getDirection() * self.queryInterval.getDirection())


    def getQueryInterval(self):
        return self.queryInterval


    def addSubMapping(self, subMapping):
        """
        Add a copy of a sub-mapping, and update the intervals, size and tags
        @param subMapping: the sub-mapping to add
        @type  subMapping: class L{SubMapping}
        """
        subMappingCopy = SubMapping(subMapping)
        self.subMappings.append(subMappingCopy)
        # The cached transcript no longer matches the sub-mappings
        self.transcript = None

        if self.targetInterval:
            self.targetInterval.setStart(min(self.targetInterval.getStart(), subMapping.targetInterval.getStart()))
            self.targetInterval.setEnd(max(self.targetInterval.getEnd(), subMapping.targetInterval.getEnd()))
        else:
            self.setTargetInterval(subMapping.targetInterval)
        if self.queryInterval:
            self.queryInterval.setStart(min(self.queryInterval.getStart(), subMapping.queryInterval.getStart()))
            self.queryInterval.setEnd(max(self.queryInterval.getEnd(), subMapping.queryInterval.getEnd()))
        else:
            self.setQueryInterval(subMapping.queryInterval)

        # NOTE(review): the direction is set on the given sub-mapping, not on
        # the stored copy -- confirm that this is intended.
        if self.getDirection() != 0:
            subMapping.setDirection(self.getDirection())
        if self.size is None:
            self.size = 0
        subMappingTagNames = subMapping.getTagNames()
        if "identity" in subMappingTagNames:
            if "identity" not in self.getTagNames():
                self.setTagValue("identity", subMapping.getTagValue("identity"))
            elif subMapping.size is not None:
                # Average of the identities, weighted by the sizes
                totalSize = self.size + subMapping.size
                if totalSize != 0:
                    self.setTagValue("identity", (self.getTagValue("identity") * self.size + subMapping.getTagValue("identity") * subMapping.size) / totalSize)
        if subMapping.size is not None:
            self.size += subMapping.size
        # Counts are summed over the sub-mappings
        for tagName in ("nbMismatches", "nbGaps"):
            if tagName in subMappingTagNames:
                if tagName not in self.getTagNames():
                    self.setTagValue(tagName, subMapping.getTagValue(tagName))
                else:
                    self.setTagValue(tagName, self.getTagValue(tagName) + subMapping.getTagValue(tagName))


    def setDirection(self, direction):
        """
        Set the direction of every sub-mapping
        @param direction: the direction of the mapping
        @type  direction: int or string
        """
        for subMapping in self.subMappings:
            subMapping.setDirection(direction)
        # The cached transcript carries the former direction
        self.transcript = None


    def getDirection(self):
        """
        Get the direction of the first sub-mapping
        @raise Exception: if the mapping has no sub-mapping
        """
        if not self.subMappings:
            raise Exception("Error! Mapping '%s' has no submapping" % (self._getLabel()))
        return self.subMappings[0].getDirection()


    def setSize(self, size):
        """
        Set the size, and the number of mismatches if the identity is known
        @param size: the size of the mapping
        @type  size: int
        """
        self.size = size
        if "identity" in self.getTagNames():
            self.setTagValue("nbMismatches", self.size - round(self.size * self.getTagValue("identity") / 100.0))


    def setTagValue(self, name, value):
        """
        Set the value of a tag, and discard the cached transcript
        @param name:  name of the tag
        @type  name:  string
        @param value: value of the tag
        """
        self.tags[name] = value
        self.transcript = None


    def getTagValue(self, name):
        return self.tags[name]


    def getTagNames(self):
        return self.tags.keys()


    def setIdentity(self, identity):
        """
        Set the percentage of identity
        Also set the number of mismatches if the size is known and the number
        of mismatches is not
        @param identity: the percentage of identity
        @type  identity: float
        """
        self.setTagValue("identity", identity)
        if self.size is not None and "nbMismatches" not in self.getTagNames():
            nbMismatches = 0 if self.size == 0 else self.size - round(self.size * self.getTagValue("identity") / 100.0)
            self.setTagValue("nbMismatches", nbMismatches)


    def setNbOccurrences(self, nbOccurrences):
        self.setTagValue("nbOccurrences", nbOccurrences)


    def setNbMismatches(self, nbMismatches):
        """
        Set the number of mismatches
        Also set the percentage of identity if the size is known and the
        identity is not
        @param nbMismatches: the number of mismatches
        @type  nbMismatches: int
        """
        self.setTagValue("nbMismatches", nbMismatches)
        if self.size is not None and "identity" not in self.getTagNames():
            identity = 100 if self.size == 0 else (self.size - self.getTagValue("nbMismatches")) / float(self.size) * 100
            self.setTagValue("identity", identity)


    def setNbGaps(self, nbGaps):
        self.setTagValue("nbGaps", nbGaps)


    def setRank(self, rank):
        self.setTagValue("rank", rank)


    def setEvalue(self, evalue):
        self.setTagValue("evalue", evalue)


    def setOccurrence(self, occurrence):
        self.setTagValue("occurrence", occurrence)


    def setBestRegion(self, bestRegion):
        self.setTagValue("bestRegion", bestRegion)


    def mergeExons(self, distance):
        """
        Merge consecutive sub-mappings that are close on the target
        The distance on the query between two merged sub-mappings is added to
        the number of gaps.
        @param distance: maximum distance on the target between two sub-mappings to be merged
        @type  distance: int
        """
        previousSubMapping = None
        subMappings        = []
        for subMapping in self.subMappings:
            if previousSubMapping is None:
                subMappings.append(subMapping)
                previousSubMapping = subMapping
                continue
            targetDistance = subMapping.targetInterval.getDistance(previousSubMapping.targetInterval)
            queryDistance  = subMapping.queryInterval.getDistance(previousSubMapping.queryInterval)
            if targetDistance <= distance:
                # A mapping without "nbGaps" tag starts counting from 0
                self.setTagValue("nbGaps", self.tags.get("nbGaps", 0) + queryDistance)
                previousSubMapping.merge(subMapping)
            else:
                subMappings.append(subMapping)
                previousSubMapping = subMapping
        self.subMappings = subMappings


    def getTranscript(self):
        """
        Extract a transcript from this mapping
        The transcript is cached until a tag, the direction or the
        sub-mappings change.
        @return: a transcript
        """
        if self.transcript is not None:
            return self.transcript
        # Work on a local object so that a failure does not leave a half-built
        # transcript in the cache
        transcript = Transcript()
        transcript.copy(self.targetInterval)
        transcript.setDirection(self.getDirection())
        transcript.setName(self.queryInterval.getName())
        transcript.removeExons()
        if len(self.subMappings) > 1:
            for subMapping in self.subMappings:
                transcript.addExon(subMapping.targetInterval)
        cpt = 1
        for exon in transcript.exons:
            exon.setDirection(transcript.getDirection())
            exon.setName("%s-exon%d" % (transcript.getName(), cpt))
            exon.setChromosome(transcript.getChromosome())
            cpt += 1
        transcript.setDirection(self.getDirection())
        transcript.sortExons()
        # As before, "bestRegion" only gets its default when there are tags
        if self.tags and "bestRegion" not in self.tags:
            transcript.setTagValue("bestRegion", "(self)")
        for tag in self.tags:
            transcript.setTagValue(tag, self.getTagValue(tag))
        self.transcript = transcript
        return self.transcript


    def getChromosome(self):
        """
        Get the chromosome of the target of the first sub-mapping
        @raise Exception: if the mapping has no sub-mapping
        """
        if not self.subMappings:
            raise Exception("Error! Mapping '%s' has no submapping" % (self._getLabel()))
        return self.subMappings[0].targetInterval.getChromosome()


    def getErrorScore(self):
        """
        Get a score which grows with the gaps, mismatches and sub-mappings
        @return: 3 * nbGaps + nbMismatches + 0.1 * (number of sub-mappings - 1)
        """
        return self.getTagValue("nbGaps") * 3 + self.getTagValue("nbMismatches") + (len(self.subMappings) - 1) * 0.1


    def printGBrowseReference(self):
        return self.getTranscript().printGBrowseReference()


    def printGBrowseLine(self):
        return self.getTranscript().printGBrowseLine()


    def printGBrowse(self):
        return self.getTranscript().printGBrowse()


    def printBed(self):
        return self.getTranscript().printBed()


    def __str__(self):
        return "%s ---- %s" % (str(self.getTranscript()), ", ". join([str(submapping) for submapping in self.subMappings]))
# Complement of each nucleotide code (IUPAC ambiguity codes included)
reverseComplementString = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "U": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "N": "N",
    "a": "t",
    "c": "g",
    "g": "c",
    "t": "a",
    "u": "a",
    "m": "k",
    "r": "y",
    "w": "w",
    "s": "s",
    "y": "r",
    "k": "m",
    "v": "b",
    "h": "d",
    "d": "h",
    "b": "v",
    "n": "n"
}

class Sequence(Bioseq):
    """
    A class that codes for a sequence
    @ivar quality:         the quality scores, one item per nucleotide (or None)
    @type quality:         list of strings
    @ivar integerQuality:  whether the quality is a list of space-separated integers
    @type integerQuality:  boolean
    @ivar chunkedSequence: the sequence cut into lines (computed on demand)
    @type chunkedSequence: list of strings
    @ivar chunkedQuality:  the quality cut into lines (computed on demand)
    @type chunkedQuality:  list of lists
    """

    # Number of nucleotides per line in the FASTA output
    CHUNK_SIZE = 60

    def __init__(self, name = "", sequence = ""):
        super(Sequence, self).__init__(name, sequence)
        self.name            = self.header
        self.quality         = None
        self.chunkedSequence = None
        self.chunkedQuality  = None
        self.integerQuality  = False

    def setName(self, name=""):
        super(Sequence, self).setHeader(name)
        # Keep the "name" attribute set by the constructor up to date
        self.name = name

    def getName(self):
        return self.getHeader()

    def setSequence(self, seq=""):
        super(Sequence, self).setSequence(seq)

    def setQuality(self, quality):
        """
        Set the quality of the sequence
        @param quality: one character per nucleotide, or space-separated integers (or None)
        @type  quality: string
        """
        if quality is None:
            self.quality = None
            return
        # Recomputed each time, so that a former integer quality does not stick
        self.integerQuality = " " in quality
        if self.integerQuality:
            self.quality = quality.split()
        else:
            self.quality = list(quality)

    def getQuality(self):
        """
        Get the quality, in the format which was given to L{setQuality}
        @return: a string, or None if there is no quality
        """
        if self.quality is None:
            return None
        if self.integerQuality:
            return " ".join(self.quality)
        return "".join(self.quality)

    def getSize(self):
        return len(self.getSequence())


    def copy(self, sequence):
        """
        Copy another sequence into this one
        @param sequence: the sequence to be copied
        @type  sequence: class L{Sequence}
        """
        self.setName(sequence.getName())
        self.setSequence(sequence.getSequence())
        # Copy the scores and their format directly: going through the string
        # form would mistake a single integer score for a list of characters
        self.quality         = None if sequence.quality is None else list(sequence.quality)
        self.integerQuality  = sequence.integerQuality
        self.chunkedSequence = None
        self.chunkedQuality  = None


    def chunkSequence(self):
        """
        Cut the sequence (and the quality, if any) into lines of CHUNK_SIZE items
        """
        sequence = self.getSequence()
        # Stepping through the positions avoids the empty last chunk that was
        # produced when the size is a multiple of the chunk size
        starts = range(0, len(sequence), self.CHUNK_SIZE)
        self.chunkedSequence = [sequence[start : start + self.CHUNK_SIZE] for start in starts]
        if self.quality is not None:
            self.chunkedQuality = [self.quality[start : start + self.CHUNK_SIZE] for start in starts]

    def concatenate(self, seq):
        """
        Append another sequence (and its quality) to this one
        @param seq: the sequence to append
        @type  seq: class L{Sequence}
        """
        sequence  = self.getSequence()
        sequence += seq.getSequence()
        self.setSequence(sequence)
        if self.quality is not None:
            sep = " " if self.integerQuality else ""
            self.setQuality(self.getQuality() + sep + seq.getQuality())
        self.chunkedSequence = None
        self.chunkedQuality  = None


    def printFasta(self):
        """
        Get the sequence in FASTA format
        @return: a string
        """
        if self.chunkedSequence is None:
            self.chunkSequence()
        return ">%s\n%s\n" % (self.getHeader(), "\n".join(self.chunkedSequence))


    def printFastq(self):
        """
        Get the sequence in FASTQ format (sequence and quality on one line each)
        @return: a string
        """
        return "@%s\n%s\n+%s\n%s\n" % (self.getHeader(), self.getSequence(), self.getHeader(), self.getQuality())


    def reverseComplement(self):
        """
        Replace the sequence by its reverse-complement, and reverse the quality
        The program exits if a character is not a known nucleotide code.
        """
        self.chunkedSequence = None
        self.chunkedQuality  = None
        sequence = self.getSequence()
        # Check from the left, so that the first unknown character is reported
        for char in sequence:
            if char not in reverseComplementString:
                sys.exit("Cannot understand character %s from string %s" % (char, sequence))
        self.setSequence("".join([reverseComplementString[char] for char in reversed(sequence)]))
        if self.quality is not None:
            self.quality = self.quality[::-1]


    def containsAmbiguousNucleotides(self):
        """
        Whether the sequence contains something else than A, C, G, T or U
        """
        return re.search("[^ACGTUacgtu]", self.getSequence()) is not None


    def shrinkToFirstNucleotides(self, nbNucleotides):
        """
        Keep the first nucleotides only
        @param nbNucleotides: the number of nucleotides to keep
        @type  nbNucleotides: int
        """
        self.chunkedSequence = None
        self.chunkedQuality  = None
        self.setSequence(self.getSequence()[0:nbNucleotides])
        if self.quality is not None:
            self.quality = self.quality[0:nbNucleotides]


    def shrinkToLastNucleotides(self, nbNucleotides):
        """
        Keep the last nucleotides only
        @param nbNucleotides: the number of nucleotides to keep
        @type  nbNucleotides: int
        """
        self.chunkedSequence = None
        self.chunkedQuality  = None
        # [-0:] is the whole sequence: compute the first kept position instead
        start = max(0, self.getSize() - nbNucleotides) if nbNucleotides >= 0 else -nbNucleotides
        self.setSequence(self.getSequence()[start:])
        if self.quality is not None:
            self.quality = self.quality[start:]
class SequenceList(object):
    """
    A class that codes for a list of sequences
    @ivar sequences: the sequences
    @type sequences: list
    @ivar verbosity: verbosity
    @type verbosity: int
    """

    def __init__(self, verbosity = 0):
        self.sequences = []
        self.verbosity = verbosity


    def nbSequences(self):
        return len(self.sequences)


    def getSequence(self, index):
        return self.sequences[index]


    def addSequence(self, sequence):
        self.sequences.append(sequence)


    def split(self, number):
        """
        Split the list into consecutive lists of equal size (the last one may be smaller)
        @param number: the number of lists wanted
        @type  number: int
        @return:       a list of L{SequenceList}, empty if there is no sequence
        """
        # Ceiling of the division, computed on integers: with "/", Python 2
        # truncates before the rounding up can take place
        size = -(-self.nbSequences() // number)

        sequenceLists = []
        sequenceList  = SequenceList()
        for sequence in self.sequences:
            sequenceList.addSequence(sequence)
            if sequenceList.nbSequences() == size:
                sequenceLists.append(sequenceList)
                sequenceList = SequenceList()
        # Remaining sequences, when the size does not divide the number of sequences
        if sequenceList.nbSequences() != 0:
            sequenceLists.append(sequenceList)
        return sequenceLists


    def printFasta(self):
        """
        Get all the sequences in FASTA format
        @return: a string
        """
        return "".join([sequence.printFasta() for sequence in self.sequences])
class SubMapping(Align):
    """
    A class that represents a part of a mapping, more precisely, a pair (target interval, query interval) that match together
    @ivar targetInterval: the target interval
    @type targetInterval: class L{Interval}
    @ivar queryInterval:  the query interval
    @type queryInterval:  class L{Interval}
    @ivar size:           size of this sub-mapping
    @type size:           int
    @ivar tags:           various information
    @type tags:           dict
    """

    def __init__(self, subMapping = None):
        """
        Constructor
        @param subMapping: a sub-mapping to be copied
        @type  subMapping: class L{SubMapping}
        """
        self.targetInterval = Interval()
        self.queryInterval  = Interval()
        Align.__init__(self, self.queryInterval, self.targetInterval)
        self.size = None
        self.tags = {}
        # "is not": "!=" on a SubMapping would go through the overloaded comparison
        if subMapping is not None:
            self.copy(subMapping)

    def __eq__(self, o):
        """
        Two sub-mappings are equal if their alignments, intervals, sizes and tags are equal
        """
        if o is None:
            return False
        areAlignAttributesEquals = Align.__eq__(self, o)
        return areAlignAttributesEquals and (self.targetInterval == o.targetInterval) and (self.queryInterval == o.queryInterval) and self.size == o.getSize() and self.tags == o.getTags()

    def getSuperAdress(self):
        return hex(id(super(Align, self)))

#    def setRangesAlignToRangesInterval(self):
#        self.range_query = super(Range, self.queryInterval)
#        self.range_subject = super(Range, self.targetInterval)

    def copy(self, subMapping):
        """
        Copy method
        @param subMapping: a sub-mapping to be copied
        @type  subMapping: class L{SubMapping}
        """
        self.setQueryName(subMapping.getQueryName())
        self.setQueryStart(subMapping.getQueryStart())
        self.setQueryEnd(subMapping.getQueryEnd())
        self.setSubjectName(subMapping.getSubjectName())
        self.setSubjectStart(subMapping.getSubjectStart())
        self.setSubjectEnd(subMapping.getSubjectEnd())
        self.e_value  = subMapping.getEvalue()
        self.score    = subMapping.getScore()
        self.identity = subMapping.getIdentity()

        self.targetInterval.copy(subMapping.targetInterval)
        self.queryInterval.copy(subMapping.queryInterval)
        self.size = subMapping.size
        for tag in subMapping.tags:
            self.tags[tag] = subMapping.tags[tag]


    def setTargetInterval(self, interval):
        """
        Set target interval
        @param interval: the target interval of the sub-mapping
        @type  interval: class L{Interval}
        """
        self.targetInterval.copy(interval)


    def setQueryInterval(self, interval):
        """
        Set query interval
        @param interval: the query interval of the sub-mapping
        @type  interval: class L{Interval}
        """
        self.queryInterval.copy(interval)


    def setSize(self, size):
        """
        Set the size of the sub-mapping
        Also set the number of mismatches if the identity is known
        @param size: the size of the sub-mapping
        @type  size: int
        """
        self.size = size
        if "identity" in self.getTagNames():
            self.setTagValue("nbMismatches", self.size - round(self.size * self.getTagValue("identity") / 100.0))


    def getDirection(self):
        """
        Get the direction of the alignment
        @return: the direction of the target interval
        """
        return self.targetInterval.getDirection()


    def setDirection(self, direction):
        """
        Set the direction of the alignment
        @param direction: the direction of the alignment
        @type  direction: int or string
        """
        return self.targetInterval.setDirection(direction)


    def setTagValue(self, name, value):
        """
        Set the value of a tag
        @param name:  name of the tag
        @type  name:  string
        @param value: value of the tag
        @type  value: string or int
        """
        self.tags[name] = value


    def getTagValue(self, name):
        """
        Get the value of a tag
        @param name: name of the tag
        @type  name: string
        @return:     value of the tag
        """
        return self.tags[name]


    def getTagNames(self):
        """
        Get all the names of the tags
        @return: the names of the tags
        """
        return self.tags.keys()

    def getTargetInterval(self):
        return self.targetInterval

    def getQueryInterval(self):
        return self.queryInterval

    def getSize(self):
        return self.size

    def getTags(self):
        return self.tags

    def setIdentity(self, identity):
        """
        Set the percentage of identity of the sub-mapping
        Possibly also set number of mismatches
        @param identity: the percentage of identity of the sub-mapping
        @type  identity: float
        """
        self.identity = identity
        self.setTagValue("identity", identity)
        if self.size is not None and "nbMismatches" not in self.getTagNames():
            self.setTagValue("nbMismatches", self.size - round(self.size * self.getTagValue("identity") / 100.0))


    def setNbMismatches(self, nbMismatches):
        """
        Set the number of mismatches of the sub-mapping
        Possibly also set percentage of identity
        @param nbMismatches: the number of mismatches of the sub-mapping
        @type  nbMismatches: int
        """
        # The attribute is kept for the code which may read it
        self.nbMismatches = nbMismatches
        # The value has to be stored as a tag too: it is read back as a tag,
        # here and in __str__
        self.setTagValue("nbMismatches", nbMismatches)
        if self.size is not None and "identity" not in self.getTagNames():
            identity = 100 if self.size == 0 else (self.size - nbMismatches) / float(self.size) * 100
            self.setTagValue("identity", identity)


    def setNbGaps(self, nbGaps):
        """
        Set the number of gaps of the sub-mapping
        @param nbGaps: the number of gaps of the sub-mapping
        @type  nbGaps: int
        """
        self.setTagValue("nbGaps", nbGaps)


    def merge(self, subMapping):
        """
        Merge two subMappings
        @param subMapping: another sub-mapping
        @type  subMapping: class L{SubMapping}
        """
        self.targetInterval.merge(subMapping.targetInterval)
        self.queryInterval.merge(subMapping.queryInterval)


    def printCoordinates(self):
        """
        Print the coordinates of the sub-mapping (considering the direction)
        @return: a string
        """
        if self.getDirection() == 1:
            return "%d-%d" % (self.targetInterval.getStart(), self.targetInterval.getEnd())
        else:
            return "%d-%d" % (self.targetInterval.getEnd(), self.targetInterval.getStart())


    def __str__(self):
        """
        Return a representation of this object
        @return: a string
        """

        if "match" in self.getTagNames() and not self.getTagValue("match"):
            # No "queryName" attribute is set by this class: use the accessor
            return "%s ---" % self.getQueryName()

        direction = "+"
        if self.getDirection() == -1:
            direction = "-"
        string = "%s:%d-%d -- %s:%d-%d  (%s)" % (self.targetInterval.getChromosome(), self.targetInterval.getStart(), self.targetInterval.getEnd(), self.queryInterval.name, self.queryInterval.getStart(), self.queryInterval.getEnd(), direction)
        if "nbMismatches" in self.getTagNames():
            string += "(%i mm)" % (self.getTagValue("nbMismatches"))
        if "identity" in self.getTagNames():
            string += "(id: %i%%)" % (self.getTagValue("identity"))
        if self.targetInterval.getSize() is not None and self.queryInterval.getSize() is not None and self.size is not None:
            string += "(sizes: %d, %d -> %d)" % (self.targetInterval.getSize(), self.queryInterval.getSize(), self.size)
        return string
of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import sys +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Sequence import Sequence + + +class Transcript(Interval): + """ + A class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals) + @ivar exons: a list of exons (intervals) + @type exons: list of L{Interval{Interval}} + """ + + def __init__(self, transcript = None, verbosity = 0): + """ + Constructor + @param transcript: transcript to be copied + @type transcript: class L{Transcript} + @param verbosity: verbosity + @type verbosity: int + """ + super(Transcript, self).__init__(None, verbosity) + self.exons = [] + self.introns = None + if transcript != None: + self.copy(transcript) + + + def copy(self, transcript): + """ + Copy method + @param transcript: transcript to be copied + @type transcript: class L{Transcript} or L{Interval} + """ + super(Transcript, self).copy(transcript) + if transcript.__class__.__name__ == "Transcript": + exons = transcript.getExons() + if len(exons) > 1: + for exon in exons: + exonCopy = Interval(exon) + self.addExon(exonCopy) + + + def setDirection(self, direction): + """ + Set the direction of the interval + Possibly parse different formats + Impact all exons + @param direction: direction of the transcript (+ / -) + @type direction: int or string + """ + super(Transcript, self).setDirection(direction) + for exon in self.exons: + exon.setDirection(direction) + + + def setChromosome(self, chromosome): + """ + Set the chromosome + @param chromosome: chromosome on which the transcript is + @type chromosome: string + """ + super(Transcript, self).setChromosome(chromosome) + for exon in self.exons: + exon.setChromosome(chromosome) + + + def addExon(self, exon): + """ + Add an exon to the list of exons + @param exon: a new exon + @type exon: class L{Interval} + """ + if not self.exons and not exon.overlapWith(self): + firstExon = Interval() + 
firstExon.setStart(self.getStart()) + firstExon.setEnd(self.getEnd()) + firstExon.setDirection(self.getDirection()) + firstExon.setChromosome(self.getChromosome()) + self.exons.append(firstExon) + newExon = Interval(exon) + newExon.setDirection(self.getDirection()) + self.exons.append(newExon) + if newExon.getStart() < self.getStart(): + self.setStart(newExon.getStart()) + if newExon.getEnd() > self.getEnd(): + self.setEnd(newExon.getEnd()) + + + def setStart(self, start): + """ + Set the new start, move the first exon accordingly (if exists) + @param start: the new start + @type start: int + """ + super(Transcript, self).setStart(start) + if self.exons: + self.sortExonsIncreasing() + self.exons[0].setStart(start) + + + def setEnd(self, end): + """ + Set the new end, move the last exon accordingly (if exists) + @param end: the new end + @type end: int + """ + super(Transcript, self).setEnd(end) + if self.exons: + self.sortExonsIncreasing() + self.exons[-1].setEnd(end) + + + def reverse(self): + """ + Reverse the strand of the transcript + """ + super(Transcript, self).reverse() + for exon in self.exons: + exon.reverse() + + + def getUniqueName(self): + """ + Try to give a unique name by possibly adding occurrence + """ + if "nbOccurrences" in self.tags and "occurrence" in self.tags and self.tags["nbOccurrences"] != 1: + return "%s-%d" % (self.name, self.tags["occurrence"]) + return self.name + + + def getNbExons(self): + """ + Get the number of exons + """ + return max(1, len(self.exons)) + + + def getExon(self, i): + """ + Get a specific exon + @param i: the rank of the exon + @type i: int + """ + if len(self.exons) == 0: + if i != 0: + raise Exception("Cannot get exon #%i while there is no exon in the transcript" % (i)) + return self + return self.exons[i] + + + def getExons(self): + """ + Get all the exons + """ + if len(self.exons) == 0: + return [Interval(self)] + return self.exons + + + def getIntrons(self): + """ + Get all the introns + Compute introns on 
the fly + """ + if self.introns != None: + return self.introns + self.sortExons() + self.introns = [] + exonStart = self.getExon(0) + for cpt, exonEnd in enumerate(self.exons[1:]): + intron = Interval() + intron.setName("%s_intron%d" % (self.getName(), cpt+1)) + intron.setChromosome(self.getChromosome()) + intron.setDirection(self.getDirection()) + if self.getDirection() == 1: + intron.setEnd(exonEnd.getStart() - 1) + intron.setStart(exonStart.getEnd() + 1) + else: + intron.setStart(exonEnd.getEnd() + 1) + intron.setEnd(exonStart.getStart() - 1) + intron.setDirection(self.getDirection()) + if intron.getSize() > 0: + self.introns.append(intron) + exonStart = exonEnd + intron.setSize(intron.getEnd() - intron.getStart() + 1) + return self.introns + + + def getSize(self): + """ + Get the size of the transcript (i.e. the number of nucleotides) + Compute size on the fly + """ + if len(self.exons) == 0: + return self.getSizeWithIntrons() + size = 0 + for exon in self.exons: + size += exon.getSize() + return size + + + def getSizeWithIntrons(self): + """ + Get the size of the interval (i.e. 
distance from start to end) + """ + return super(Transcript, self).getSize() + + + def overlapWithExon(self, transcript, nbNucleotides = 1): + """ + Check if the exons of this transcript overlap with the exons of another transcript + @param transcript: transcript to be compared to + @type transcript: class L{Transcript} + @param nbNucleotides: minimum number of nucleotides to declare and overlap + @type nbNucleotides: int + """ + if not self.overlapWith(transcript, nbNucleotides): + return False + for thisExon in self.getExons(): + for thatExon in transcript.getExons(): + if thisExon.overlapWith(thatExon, nbNucleotides): + return True + return False + + + def include(self, transcript): + """ + Whether this transcript includes the other one + @param transcript: object to be compared to + @type transcript: class L{Transcript} + """ + if not super(Transcript, self).include(transcript): + return False + for thatExon in transcript.getExons(): + for thisExon in self.getExons(): + if thisExon.include(thatExon): + break + else: + return False + return True + + + def merge(self, transcript, normalization = False): + """ + Merge with another transcript + Merge exons if they overlap, otherwise add exons + @param transcript: transcript to be merged to + @type transcript: class L{Transcript} + @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements + @type normalization: boolean + """ + if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): + raise Exception("Cannot merge '%s' with '%s'!" 
% (self, transcript)) + + theseExons = self.getExons() + thoseExons = transcript.getExons() + + for thatExon in thoseExons: + toBeRemoved = [] + for thisIndex, thisExon in enumerate(theseExons): + if thisExon.overlapWith(thatExon): + thatExon.merge(thisExon) + toBeRemoved.append(thisIndex) + theseExons.append(thatExon) + for thisIndex in reversed(toBeRemoved): + del theseExons[thisIndex] + self.removeExons() + self.setStart(min(self.getStart(), transcript.getStart())) + self.setEnd(max(self.getEnd(), transcript.getEnd())) + if len(theseExons) > 1: + for thisExon in theseExons: + self.addExon(thisExon) + + self.setName("%s--%s" % (self.getUniqueName(), transcript.getUniqueName())) + super(Transcript, self).merge(transcript, normalization) + + + def getDifference(self, transcript, sameStrand = False): + """ + Get the difference between this cluster and another one + @param transcript: object to be compared to + @type transcript: class L{Transcript} + @param sameStrand: do the comparison iff the transcripts are on the same strand + @type sameStrand: boolean + @return: a transcript + """ + newTranscript = Transcript() + newTranscript.copy(self) + if self.getChromosome() != transcript.getChromosome(): + return newTranscript + if not self.overlapWith(transcript): + return newTranscript + if sameStrand and self.getDirection() != transcript.getDirection(): + return newTranscript + newTranscript.removeExons() + if transcript.getEnd() > newTranscript.getStart(): + newTranscript.setStart(transcript.getEnd() + 1) + if transcript.getStart() < newTranscript.getEnd(): + newTranscript.setEnd(transcript.getStart() + 1) + theseExons = [] + for exon in self.getExons(): + exonCopy = Interval() + exonCopy.copy(exon) + theseExons.append(exonCopy) + for thatExon in transcript.getExons(): + newExons = [] + for thisExon in theseExons: + newExons.extend(thisExon.getDifference(thatExon)) + theseExons = newExons + if not theseExons: + return None + newStart, newEnd = theseExons[0].getStart(), 
theseExons[0].getEnd() + for thisExon in theseExons[1:]: + newStart = min(newStart, thisExon.getStart()) + newEnd = max(newEnd, thisExon.getEnd()) + newTranscript.setEnd(newEnd) + newTranscript.setStart(newStart) + newTranscript.exons = theseExons + return newTranscript + + + def getIntersection(self, transcript): + """ + Get the intersection between this transcript and another one + @param transcript: object to be compared to + @type transcript: class L{Transcript} + @return: an other transcript + """ + if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): + return None + newTranscript = Transcript() + newTranscript.setDirection(self.getDirection()) + newTranscript.setChromosome(self.getChromosome()) + newTranscript.setName("%s_intersect_%s" % (self.getName(), transcript.getName())) + newExons = [] + for thisExon in self.getExons(): + for thatExon in transcript.getExons(): + newExon = thisExon.getIntersection(thatExon) + if newExon != None: + newExons.append(newExon) + if not newExons: + return None + newTranscript.exons = newExons + return newTranscript + + + def getSqlVariables(cls): + """ + Get the properties of the object that should be saved in a database + """ + variables = Interval.getSqlVariables() + variables.append("exons") + return variables + getSqlVariables = classmethod(getSqlVariables) + + + def setSqlValues(self, array): + """ + Set the values of the properties of this object as given by a results line of a SQL query + @param array: the values to be copied + @type array: a list + """ + super(Transcript, self).setSqlValues(array) + mergedExons = array[8] + if not mergedExons: + return + for exonCount, splittedExon in enumerate(mergedExons.split(",")): + start, end = splittedExon.split("-") + exon = Interval() + exon.setChromosome(self.getChromosome()) + exon.setDirection(self.getDirection()) + exon.setName("%s_exon%d" % (self.getName(), exonCount+1)) + exon.setStart(int(start)) + 
exon.setEnd(int(end)) + self.addExon(exon) + + + def getSqlValues(self): + """ + Get the values of the properties that should be saved in a database + """ + values = super(Transcript, self).getSqlValues() + values["size"] = self.getSize() + if self.getNbExons() == 1: + values["exons"] = "" + else: + values["exons"] = ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in self.getExons()]) + return values + + + def getSqlTypes(cls): + """ + Get the types of the properties that should be saved in a database + """ + types = Interval.getSqlTypes() + types["exons"] = "varchar" + return types + getSqlTypes = classmethod(getSqlTypes) + + + def getSqlSizes(cls): + """ + Get the sizes of the properties that should be saved in a database + """ + sizes = Interval.getSqlSizes() + sizes["exons"] = 10000 + return sizes + getSqlSizes = classmethod(getSqlSizes) + + + def sortExons(self): + """ + Sort the exons + Increasing order if transcript is on strand "+", decreasing otherwise + """ + self.sortExonsIncreasing() + if self.getDirection() == -1: + exons = self.getExons() + exons.reverse() + self.exons = exons + + + def sortExonsIncreasing(self): + """ + Sort the exons + Increasing order + """ + exons = self.getExons() + sortedExons = [] + while len(exons) > 0: + minExon = exons[0] + for index in range(1, len(exons)): + if minExon.getStart() > exons[index].getStart(): + minExon = exons[index] + sortedExons.append(minExon) + exons.remove(minExon) + self.exons = sortedExons + + + def extendStart(self, size): + """ + Extend the transcript by the 5' end + @param size: the size to be extended + @type size: int + """ + if len(self.exons) != 0: + self.sortExons() + if self.getDirection() == 1: + self.exons[0].setStart(max(0, self.exons[0].getStart() - size)) + else: + self.exons[0].setEnd(self.exons[0].getEnd() + size) + super(Transcript, self).extendStart(size) + self.bin = None + + + def extendEnd(self, size): + """ + Extend the transcript by the 3' end + @param size: the 
size to be extended + @type size: int + """ + if len(self.exons) != 0: + self.sortExons() + if self.getDirection() == 1: + self.exons[-1].setEnd(self.exons[-1].getEnd() + size) + else: + self.exons[-1].setStart(max(0, self.exons[-1].getStart() - size)) + super(Transcript, self).extendEnd(size) + self.bin = None + + + def extendExons(self, size): + """ + Extend all the exons + @param size: the size to be extended + @type size: int + """ + if len(self.exons) != 0: + self.sortExons() + exons = [] + previousExon = None + for exon in self.exons: + exon.extendStart(size) + exon.extendEnd(size) + exon.setDirection(self.getDirection()) + if previousExon != None and previousExon.overlapWith(exon): + previousExon.merge(exon) + else: + if previousExon != None: + exons.append(previousExon) + previousExon = exon + exons.append(previousExon) + self.exons = exons + super(Transcript, self).extendStart(size) + super(Transcript, self).extendEnd(size) + self.bin = None + + + def restrictStart(self, size = 1): + """ + Restrict the transcript by some nucleotides, start from its start position + Remove the exons + @param size: the size to be restricted to + @type size: int + """ + newExons = [] + if self.getDirection() == 1: + for exon in self.exons: + if exon.getStart() <= self.getStart() + size - 1: + if exon.getEnd() > self.getStart() + size - 1: + exon.setEnd(self.getStart() + size - 1) + newExons.append(exon) + else: + for exon in self.exons: + if exon.getEnd() >= self.getEnd() - size + 1: + if exon.getStart() < self.getEnd() - size + 1: + exon.setStart(self.getEnd() - size + 1) + newExons.append(exon) + super(Transcript, self).restrictStart(size) + self.exons = newExons + + + def restrictEnd(self, size = 1): + """ + Restrict the transcript by some nucleotides, end from its end position + Remove the exons + @param size: the size to be restricted to + @type size: int + """ + newExons = [] + if self.getDirection() == 1: + for exon in self.exons: + if exon.getEnd() >= self.getEnd() - 
size + 1: + if exon.getStart() < self.getEnd() - size + 1: + exon.setStart(self.getEnd() - size + 1) + newExons.append(exon) + else: + for exon in self.exons: + if exon.getEnd() >= self.getEnd() - size + 1: + if exon.getStart() < self.getEnd() - size + 1: + exon.setEnd(self.getEnd() - size + 1) + newExons.append(exon) + super(Transcript, self).restrictEnd(size) + self.exons = newExons + + + def removeExons(self): + """ + Remove the exons and transforms the current transcript into a mere interval + """ + self.exons = [] + self.bin = None + + + def printGtf(self, title): + """ + Export this transcript using GTF2.2 format + @param title: the title of the transcripts + @type title: string + @return: a string + """ + transcriptId = self.getUniqueName() + geneId = "%s_gene" % (transcriptId) + direction = "+" + if self.getDirection() == -1: + direction = "-" + self.sortExonsIncreasing() + string = "" + for i, exon in enumerate(self.getExons()): + exonCopy = Interval() + exonCopy.copy(exon) + if "ID" in exonCopy.getTagValues(): + del exonCopy.tags["ID"] + feature = "exon" + if "feature" in exonCopy.getTagNames(): + feature = exonCopy.getTagValue("feature") + del exonCopy.tags["feature"] + score = "." 
+ if "score" in exonCopy.getTagNames(): + score = "%d" % (int(exonCopy.getTagValue("score"))) + del exonCopy.tags["score"] + if "Parent" in exonCopy.getTagNames(): + del exonCopy.tags["Parent"] + exonCopy.setName("%s_part%d" % (self.getName(), i+1)) + comment = exonCopy.getTagValues("; ", " ", "\"") + string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\ttranscript_id \"%s\"; gene_id \"%s\"; %s\n" % (exonCopy.getChromosome(), title, feature, exonCopy.getStart(), exonCopy.getEnd(), score, direction, transcriptId, geneId, comment) + return string + + + def printGff2(self, title): + """ + Export this transcript using GFF2 format + @param title: the title of the transcripts + @type title: string + @return: a string + """ + direction = "+" + if self.getDirection() == -1: + direction = "-" + self.sortExonsIncreasing() + comment = self.getTagValues() + if comment != None: + comment = ";%s" % (comment) + score = "." + if "score" in self.getTagNames(): + score = "%d" % (int(self.getTagValue("score"))) + feature = "transcript" + if "feature" in self.getTagNames(): + feature = self.getTagValue("feature") + string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tGENE %s%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, self.name, comment) + for exon in self.getExons(): + if "score" in exon.getTagNames(): + score = "%d" % (int(self.getTagValue("score"))) + string += "%s\t%s\t_exon\t%d\t%d\t%s\t%s\t.\tGENE %s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.name) + return string + + + def printGff3(self, title): + """ + Export this transcript using GFF3 format + @param title: the title of the transcripts + @type title: string + @return: a string + """ + direction = "+" + if self.getDirection() == -1: + direction = "-" + self.sortExonsIncreasing() + if "ID" not in self.getTagValues(): + self.setTagValue("ID", self.getUniqueName()) + feature = "transcript" + tags = self.tags + if "feature" in self.getTagNames(): + feature 
= self.getTagValue("feature") + del self.tags["feature"] + score = "." + if "score" in self.getTagNames(): + score = "%d" % (int(self.getTagValue("score"))) + del self.tags["score"] + comment = self.getTagValues(";", "=") + string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, comment) + if len(self.exons) > 1: + for i, exon in enumerate(self.getExons()): + if "score" in exon.getTagNames(): + score = "%d" % (int(exon.getTagValue("score"))) + string += "%s\t%s\texon\t%d\t%d\t%s\t%s\t.\tID=%s-exon%d;Name=%s-exon%d;Parent=%s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.getTagValue("ID"), i+1, self.name, i+1, self.getTagValue("ID")) + self.tags = tags + return string + + + def printEmbl(self): + """ + Export this transcript using EMBL format + @return: a string + """ + if len(self.exons) <= 1: + position = "%d..%d" % (self.getStart(), self.getEnd()) + else: + positions = [] + for exon in self.getExons(): + positions.append("%d..%d" % (self.getStart(), self.getEnd())) + position = ",".join(positions) + position = "join(%s)" % (position) + if self.getDirection() == -1: + position = "complement(%s)" % (position) + feature = "misc_feature" + if "feature" in self.getTagNames(): + if not self.getTagValue("feature").startswith("S-MART"): + feature = self.getTagValue("feature") + string = "FT %s %s\n" % (feature, position) + if "Name" in self.getTagNames(): + string += "FT /label=\"%s\"\n" % (self.getTagValue("Name")) + return string + + + def printBed(self): + """ + Export this transcript using BED format + @return: a string + """ + name = self.name + if "nbOccurrences" in self.getTagNames() and self.getTagValue("nbOccurrences") != 1 and self.getTagValue("occurrences"): + name = "%s-%d" % (name, self.getTagValue("occurrence")) + comment = self.getTagValues(";", "=") + sizes = [] + starts = [] + direction = "+" + if self.getDirection() == -1: + 
direction = "-" + self.sortExonsIncreasing() + for exon in self.getExons(): + sizes.append("%d" % (exon.getSize())) + starts.append("%d" % (exon.getStart() - self.getStart())) + return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome(), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) + + + def printSam(self): + """ + Export this transcript using SAM format + @return: a string + """ + name = self.name + flag = 0 if self.getDirection() == 1 else 0x10 + chromosome = self.getChromosome() + genomeStart = self.getStart() + quality = 255 + mate = "*" + mateGenomeStart = 0 + gapSize = 0 + sequence = "*" + qualityString = "*" + tags = "NM:i:0" + + lastExonEnd = None + self.sortExonsIncreasing() + exon = self.getExons()[0] + cigar = "%dM" % (self.getExons()[0].getSize()) + lastExonEnd = exon.getEnd() + for i, exon in enumerate(self.getExons()): + if i == 0: + continue + cigar += "%dN" % (exon.getStart() - lastExonEnd - 1) + cigar += "%dM" % (exon.getSize()) + + return "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags) + + + def printUcsc(self): + """ + Export this transcript using UCSC BED format + @return: a string + """ + if self.getChromosome().find("Het") != -1: + return "" + name = self.name + comment = self.getTagValues(";", "") + sizes = [] + starts = [] + direction = "+" + if self.getDirection() == -1: + direction = "-" + self.sortExonsIncreasing() + for exon in self.getExons(): + sizes.append("%d" % (exon.getSize())) + starts.append("%d" % (exon.getStart() - self.getStart())) + return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) + + + def 
printGBrowseReference(self): + """ + Export this transcript using GBrowse format (1st line only) + @return: a string + """ + return "reference = %s\n" % (self.getChromosome()) + + + def printGBrowseLine(self): + """ + Export this transcript using GBrowse format (2nd line only) + @return: a string + """ + self.sortExons() + coordinates = [] + for exon in self.getExons(): + coordinates.append(exon.printCoordinates()) + coordinatesString = ",".join(coordinates) + comment = self.getTagValues(";", "=") + if comment: + comment = "\t\"%s\"" % (comment) + return "User_data\t%s\t%s%s\n" % (self.name, coordinatesString, comment) + + + def printGBrowse(self): + """ + Export this transcript using GBrowse format + @return: a string + """ + return "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine()) + + + def printCsv(self): + """ + Export this transcript using CSV format + @return: a string + """ + self.sortExons() + string = "%s,%d,%d,\"%s\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-") + if len(self.getExons()) == 1: + string += "None" + else: + for exon in self.getExons(): + string += "%d-%d " % (exon.getStart(), exon.getEnd()) + for tag in sorted(self.tags.keys()): + string += ",%s=%s" % (tag, str(self.tags[tag])) + string += "\n" + return string + + + def extractSequence(self, parser): + """ + Get the sequence corresponding to this transcript + @param parser: a parser to a FASTA file + @type parser: class L{SequenceListParser} + @return: an instance of L{Sequence} + """ + self.sortExons() + name = self.name + if "ID" in self.getTagNames() and self.getTagValue("ID") != self.name: + name += ":%s" % (self.getTagValue("ID")) + sequence = Sequence(name) + for exon in self.getExons(): + sequence.concatenate(exon.extractSequence(parser)) + return sequence + + + def extractWigData(self, parser): + """ + Get some wig data corresponding to this transcript + @param parser: a parser to a wig file + @type parser: class 
L{WigParser} + @return: a sequence of float + """ + self.sortExons() + if parser.strands: + strands = (-1, 1) + values = dict([(strand, []) for strand in strands]) + for exon in self.getExons(): + theseValues = exon.extractWigData(parser) + if self.getDirection() == -1: + for strand in strands: + theseValues[strand].reverse() + for strand in strands: + values[strand].extend(theseValues[strand]) + if self.getDirection() == -1: + for strand in strands: + values[strand].reverse() + return values + else: + values = [] + for exon in self.getExons(): + theseValues = exon.extractWigData(parser) + #if self.getDirection() == -1: + # theseValues.reverse() + values.extend(theseValues) + #if self.getDirection() == -1: + # values.reverse() + return values diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/structure/TranscriptContainer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/TranscriptContainer.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,236 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
class TranscriptContainer(object):
    """
    An interface class that wraps a list of transcripts and hides its storage format
    (a file of transcripts, a file of mappings, or tables stored in a database).
    The data are located lazily: every accessor calls L{findData} on first use.
    @ivar container:            container of the data (input file name, or table name prefix for the "sql" format)
    @type container:            string
    @ivar format:               format of the data
    @type format:               string
    @ivar transcriptListParser: possibly contains a parser to a list of transcripts
    @type transcriptListParser: L{TranscriptListParser} or None
    @ivar mappingListParser:    possibly contains a parser to a list of mappings
    @type mappingListParser:    L{MapperParser} or None
    @ivar transcriptTables:     possibly contains the SQL tables, one per chromosome
    @type transcriptTables:     dict of L{MySqlTranscriptTable}
    @ivar mySqlConnection:      connection to the database (must be set by the caller before any SQL access)
    @type mySqlConnection:      class L{MySqlConnection} or None
    @ivar foundData:            whether L{findData} has already been run
    @type foundData:            boolean
    @ivar nbTranscripts:        number of transcripts (or mappings), known once the data are found
    @type nbTranscripts:        int or None
    @ivar nbNucleotides:        number of nucleotides, known once the data are found
    @type nbNucleotides:        int or None
    @ivar chromosomes:          chromosomes, known once the data are found
    @type chromosomes:          list or None
    @ivar type:                 type of the data ("transcript", "mapping" or "sql")
    @type type:                 string or None
    @ivar verbosity:            verbosity
    @type verbosity:            int
    """

    def __init__(self, container, format, verbosity = 0):
        """
        Constructor; exits the program if the container or the format is missing
        @param container: container of the data
        @type  container: string
        @param format:    format of the data
        @type  format:    string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.container            = container
        self.format               = format
        self.verbosity            = verbosity
        self.transcriptListParser = None
        self.mappingListParser    = None
        self.transcriptTables     = {}
        self.mySqlConnection      = None
        self.foundData            = False
        self.nbTranscripts        = None
        self.nbNucleotides        = None
        self.chromosomes          = None
        self.type                 = None
        if self.container is None:
            sys.exit("Error! Container input file name is empty!")
        if self.format is None:
            sys.exit("Error! Container input format is empty!")


    def _findSqlData(self):
        """
        Discover the per-chromosome tables of an "sql" container and count their content
        """
        self.transcriptTables = {}
        self.chromosomes      = []
        self.nbTranscripts    = 0
        self.nbNucleotides    = 0
        self.type             = "sql"
        # NOTE(review): the container name is interpolated directly into the query;
        # it is assumed to be a trusted table prefix -- confirm against the callers.
        query = self.mySqlConnection.executeQuery("SELECT name FROM sqlite_master WHERE type LIKE 'table' AND name LIKE '%s_%%_transcripts'" % (self.container))
        for line in query.getIterator():
            tableName = line[0]
            # table names look like "<container>_<chromosome>_transcripts"
            m = re.search(r"^(\S*)_transcripts$", tableName[len(self.container)+1:])
            if m is None:
                sys.exit("Table '%s' has a strange name" % (tableName))
            chromosome = m.group(1)
            table      = MySqlTranscriptTable(self.mySqlConnection, self.container, chromosome, self.verbosity)
            self.transcriptTables[chromosome] = table
            self.chromosomes.append(chromosome)
            for transcript in table.getIterator():
                self.nbTranscripts += 1
                self.nbNucleotides += transcript.getSize()


    def _findParserData(self):
        """
        Choose the parser matching the format; exits the program if the format is unknown
        """
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(self.format)
        self.type = parserChooser.getType()
        if self.type == "transcript":
            self.transcriptListParser = parserChooser.getParser(self.container)
        elif self.type == "mapping":
            self.mappingListParser = parserChooser.getParser(self.container)
        else:
            sys.exit("Error! Cannot handle format '%s'!" % (self.format))


    def findData(self):
        """
        Load data: locate the tables or build the parser, then cache the
        number of transcripts, the number of nucleotides and the chromosomes
        """
        if self.format is None:
            sys.exit("Error! Format is not specified!")
        if self.format == "sql":
            self._findSqlData()
        elif self.type is None:
            self._findParserData()

        if self.transcriptListParser is not None and self.type == "transcript":
            self.nbTranscripts = self.transcriptListParser.getNbTranscripts()
            self.nbNucleotides = self.transcriptListParser.getNbNucleotides()
            self.chromosomes   = self.transcriptListParser.getChromosomes()
        if self.mappingListParser is not None and self.type == "mapping":
            self.nbTranscripts = self.mappingListParser.getNbMappings()
            self.nbNucleotides = self.mappingListParser.getNbNucleotides()
            self.chromosomes   = self.mappingListParser.getChromosomes()

        self.foundData = True


    def getNbTranscripts(self):
        """
        Get the number of transcripts
        @return: the number of transcripts
        """
        if not self.foundData:
            self.findData()
        return self.nbTranscripts


    def getNbItems(self):
        """
        Same as getNbTranscripts
        """
        return self.getNbTranscripts()


    def getNbNucleotides(self):
        """
        Get the number of nucleotides
        @return: the number of nucleotides
        """
        if not self.foundData:
            self.findData()
        return self.nbNucleotides


    def getChromosomes(self):
        """
        Get the chromosomes
        @return: the chromosomes
        """
        if not self.foundData:
            self.findData()
        return self.chromosomes


    def getIterator(self):
        """
        An iterator (mappings are converted to transcripts on the fly)
        @return: an iterator to a list of transcripts
        """
        if not self.foundData:
            self.findData()
        if self.type == "sql":
            for chromosome in self.transcriptTables:
                for transcript in self.transcriptTables[chromosome].getIterator():
                    yield transcript
        elif self.type == "transcript":
            for transcript in self.transcriptListParser.getIterator():
                yield transcript
        elif self.type == "mapping":
            for mapping in self.mappingListParser.getIterator():
                yield mapping.getTranscript()
        else:
            sys.exit("Error! No valid transcript container given!")


    def storeIntoDatabase(self, name = None):
        """
        Store the current transcript / mapping list into database.
        Does nothing if there is no parser, or if tables already exist.
        @param name: name given to the L{MySqlTranscriptWriter} (presumably the name of the tables)
        @type  name: string or None
        """
        if not self.foundData:
            self.findData()

        parser = self.transcriptListParser
        if parser is None:
            parser = self.mappingListParser
        if parser is None or len(self.transcriptTables) != 0:
            return

        mySqlTranscriptWriter = MySqlTranscriptWriter(self.mySqlConnection, name, self.verbosity)
        mySqlTranscriptWriter.addTranscriptList(parser)
        mySqlTranscriptWriter.write()
        self.transcriptTables = mySqlTranscriptWriter.getTables()
        # from now on, the data are read from the database
        self.type = "sql"


    def getTables(self):
        """
        Accessor to the SQL tables
        @return: the SQL tables
        """
        return self.transcriptTables


    def setDefaultTagValue(self, name, value):
        """
        Set the given tag to the value for all transcripts
        @param name:  name of the tag
        @type  name:  string
        @param value: value of the tag
        @type  value: string
        """
        # like every other accessor, locate the data first: before that,
        # the type is unknown and the tag would be silently dropped
        if not self.foundData:
            self.findData()
        if self.type == "sql":
            for chromosome in self.transcriptTables:
                self.transcriptTables[chromosome].setDefaultTagValue(name, value)
        elif self.type == "transcript":
            self.transcriptListParser.setDefaultTagValue(name, value)
        elif self.type == "mapping":
            self.mappingListParser.setDefaultTagValue(name, value)
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class TranscriptList(object):
    """
    A class that codes for a list of transcripts, grouped by chromosome
    @ivar transcripts:       the transcripts, stored per chromosome (an entry may be None, and is then skipped by the iterator)
    @type transcripts:       dict of lists of L{Transcript}
    @ivar longestTranscript: largest (end - start) seen among the added transcripts
    @type longestTranscript: int
    @ivar verbosity:         verbosity
    @type verbosity:         int
    """

    def __init__(self, verbosity = 0):
        self.transcripts       = dict()
        self.longestTranscript = 0
        self.verbosity         = verbosity


    def getTranscript(self, chromosome, index):
        """
        @return: the index-th transcript of the chromosome
        """
        return self.transcripts[chromosome][index]


    def getChromosomes(self):
        """
        @return: the list of the chromosomes which contain transcripts
        """
        return list(self.transcripts.keys())


    def getTranscriptsOnChromosome(self, chromosome):
        """
        @return: the transcripts of the chromosome (an empty list if the chromosome is unknown)
        """
        if chromosome not in self.transcripts:
            return []
        return self.transcripts[chromosome]


    def addTranscript(self, transcript):
        """
        Append a transcript to the list of its chromosome
        """
        self.transcripts.setdefault(transcript.getChromosome(), []).append(transcript)
        self.longestTranscript = max(self.longestTranscript, transcript.getEnd() - transcript.getStart())


    def removeTranscript(self, chromosome, i):
        """
        Remove the i-th transcript of the chromosome
        """
        del self.transcripts[chromosome][i]


    def removeAll(self):
        """
        Remove every transcript
        """
        # NOTE(review): longestTranscript is left untouched here, as before -- confirm this is intended
        self.transcripts = {}


    def getNbTranscripts(self):
        """
        @return: the number of stored entries
        """
        return sum(len(transcripts) for transcripts in self.transcripts.values())


    def getSize(self):
        """
        @return: the sum of the sizes of the transcripts
        """
        return sum(transcript.getSize() for transcript in self.getIterator())


    def sort(self):
        """
        Sort the transcripts of each chromosome by start position, in place
        """
        # a key function works with Python 2 and 3, whereas a comparison function is Python 2 only
        for transcripts in self.transcripts.values():
            transcripts.sort(key = lambda transcript: transcript.getStart())


    def _removeOverlapping(self, transcriptList, overlaps):
        """
        Remove the transcripts for which overlaps(this, that) is true for some transcript of the other list
        @param transcriptList: the other list (it is sorted in place)
        @type  transcriptList: class L{TranscriptList}
        @param overlaps:       the overlap test
        @type  overlaps:       function of two L{Transcript}, returning a boolean
        """
        transcriptList.sort()
        for chromosome in self.transcripts:
            # the other list may not contain this chromosome at all
            others   = transcriptList.getTranscriptsOnChromosome(chromosome)
            progress = Progress(len(self.transcripts[chromosome]), "Handling chromosome %s" % (chromosome), self.verbosity)
            kept     = []
            for thisTranscript in self.transcripts[chromosome]:
                progress.inc()
                if thisTranscript is None:
                    continue
                for thatTranscript in others:
                    if overlaps(thisTranscript, thatTranscript):
                        break
                    # the others are sorted by start: once one starts after the end of this transcript,
                    # none of the following can reach it (assuming overlaps are purely positional)
                    if thatTranscript.getStart() > thisTranscript.getEnd():
                        kept.append(thisTranscript)
                        break
                else:
                    kept.append(thisTranscript)
            self.transcripts[chromosome] = kept
            progress.done()


    def removeOverlapWith(self, transcriptList):
        """
        Remove the transcripts which overlap with a transcript of the given list
        @param transcriptList: the other list (it is sorted in place)
        @type  transcriptList: class L{TranscriptList}
        """
        self._removeOverlapping(transcriptList, lambda thisTranscript, thatTranscript: thisTranscript.overlapWith(thatTranscript))


    def removeOverlapWithExon(self, transcriptList):
        """
        Remove the transcripts which overlap (according to overlapWithExon) with a transcript of the given list
        @param transcriptList: the other list (it is sorted in place)
        @type  transcriptList: class L{TranscriptList}
        """
        self._removeOverlapping(transcriptList, lambda thisTranscript, thatTranscript: thisTranscript.overlapWithExon(thatTranscript))


    def setDefaultTagValue(self, name, value):
        """
        Set the given tag to the value for all transcripts
        """
        for transcript in self.getIterator():
            transcript.setTag(name, value)


    def storeDatabase(self, mySqlConnection):
        """
        Store the transcripts into the table "TmpTranscriptsTable", and their exons
        into the table "TmpIntervalsTable" (each exon refers to the id of its transcript)
        @param mySqlConnection: connection to the database
        @type  mySqlConnection: class L{MySqlConnection}
        """
        transcriptsTable = MySqlTable("TmpTranscriptsTable", mySqlConnection)
        transcriptsTable.create(Transcript.getSqlVariables(), Transcript.getSqlTypes())
        intervalsVariables = Interval.getSqlVariables()
        intervalsVariables.append("idTranscript")
        intervalsTypes = Interval.getSqlTypes()
        intervalsTypes["idTranscript"] = "int"
        intervalsTable = MySqlTable("TmpIntervalsTable", mySqlConnection)
        intervalsTable.create(intervalsVariables, intervalsTypes)
        for chromosome in self.transcripts:
            for transcript in self.transcripts[chromosome]:
                idTranscript = transcriptsTable.addLine(transcript.getSqlValues())
                for exon in transcript.getExons():
                    intervalValues = exon.getSqlValues()
                    intervalValues["idTranscript"] = idTranscript
                    intervalsTable.addLine(intervalValues)


    def getIterator(self):
        """
        @return: an iterator on the transcripts, chromosome after chromosome (None entries are skipped)
        """
        # work on a snapshot of the chromosomes (dictionary keys cannot be indexed in Python 3)
        for chromosome in list(self.transcripts):
            for transcript in self.transcripts[chromosome]:
                if transcript is not None:
                    yield transcript


    def __str__(self):
        return "".join(str(transcript) for transcript in self.getIterator())
class TranscriptListIterator(object):
    """
    A class that iterates on a list of transcripts, chromosome after chromosome
    (None entries are skipped)
    @ivar transcriptList:    the list to iterate on
    @type transcriptList:    class L{TranscriptList}
    @ivar verbosity:         verbosity
    @type verbosity:         int
    @ivar chromosomes:       the chromosomes of the list, as found when the iterator was created
    @type chromosomes:       list
    @ivar currentChromosome: index (in chromosomes) of the current chromosome
    @type currentChromosome: int
    @ivar currentTranscript: index of the last returned transcript in the current chromosome
    @type currentTranscript: int
    """

    def __init__(self, transcriptList, verbosity = 0):
        self.transcriptList    = transcriptList
        self.verbosity         = verbosity
        # take a real list: the keys are accessed by index (which a Python 3 key view does not support)
        self.chromosomes       = list(self.transcriptList.transcripts.keys())
        self.currentChromosome = 0
        self.currentTranscript = -1


    def __iter__(self):
        return self


    def next(self):
        """
        @return: the next transcript which is not None
        @raise StopIteration: when every chromosome has been read
        """
        self.currentTranscript += 1
        while True:
            # compare with the snapshot which is actually indexed below
            if self.currentChromosome >= len(self.chromosomes):
                raise StopIteration
            transcripts = self.transcriptList.transcripts[self.chromosomes[self.currentChromosome]]
            if self.currentTranscript >= len(transcripts):
                # end of this chromosome: go to the start of the next one
                self.currentTranscript  = 0
                self.currentChromosome += 1
            elif transcripts[self.currentTranscript] is None:
                self.currentTranscript += 1
            else:
                return transcripts[self.currentTranscript]


    # Python 3 name of the iterator protocol method
    __next__ = next
b/SMART/Java/Python/structure/TranscriptListsComparator.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,1198 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import sys +import random +from SMART.Java.Python.misc import Utils +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.TranscriptList import TranscriptList +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection +from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable +from SMART.Java.Python.misc.Progress import Progress +from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter + + + +class TranscriptListsComparator(object): + """ + Compare two transcript lists, using a database for one of the list + Uses one TranscriptContainer for query data, + one TranscriptContainer exported to MySqlTranscriptTable for reference data, + one MySqlTranscriptTable for transformed reference data + @ivar inputTranscriptContainers: parsers to the list of query transcripts + @type inputTranscriptContainers: list of 2 L{TranscriptContainer} + @ivar writer: transcript list writer + @type writer: class L{TranscriptListWriter} + @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently) + @type mySqlConnection: class L{MySqlConnection} + @ivar introns: compare transcripts or exons only + @type introns: list of 2 boolean + @ivar starts: restrict the query transcripts to first nucleotides + @type starts: list of 2 int or None + @ivar fivePrimes: extend a list of transcripts by their 5' end + @type fivePrimes: list of 2 int or None + @ivar threePrimes: extend a list of transcripts by their 3' end + @type threePrimes: list of 2 int or None + @ivar minDistance: min distance between two transcripts [default: 0] + @type minDistance: int + @ivar maxDistance: max distance between two transcripts [default: 0] + @type maxDistance: int + @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap + @type minOverlap: int + @ivar pcOverlap: percentage of 
overlapping nucleotides to declare an overlap + @type pcOverlap: int + @ivar upstreams: consider distances with elements which are upstream of the transcripts + @type upstreams: boolean + @ivar downstreams: consider distances with elements which are downstream of the transcripts + @type downstreams: boolean + @ivar colinear: whether transcripts should overlap in the same direction + @type colinear: boolean + @ivar antisense: whether transcripts should overlap in the opposite direction + @type antisense: boolean + @ivar outputDistance: output distance between query and reference instead of query transcript + @type outputDistance: boolean + @ivar absolute: do not consider the strand while computing distance + @type absolute: boolean + @ivar strandedDistance: return a line per strand while computing distances + @type strandedDistance: boolean + @ivar QUERY: constant specifying the query objects + @type QUERY: int + @ivar REFERENCE: constant specifying the reference objects + @type REFERENCE: int + @ivar INPUTTYPES: set of input types of data (query or reference) objects + @type INPUTTYPES: list of 2 int + @ivar typeToString: string representation of the previous types + @type typeToString: dict + @ivar tableNames: name of the transcript tables + @type tableNames: dict of strings + @ivar nbTranscripts: number of transcript in the query/reference set + @type nbTranscripts: list of 2 int or None + @ivar nbNucleotides: number of nucleotides in the query/reference set + @type nbNucleotides: list of 2 int or None + @ivar transcriptsToBeStored: transcripts that will be stored into database + @type transcriptsToBeStored: dict of class L{TranscriptList} + @ivar multiple: in merge mode, aggregate multiple transcripts + @type multiple: boolean + @ivar normalization: normalize each element by the number of mappings of this element + @type normalization: boolean + @ivar invert: invert the current comparison + @type invert: boolean + @ivar splitDifference: split into intervals when 
computing difference + @type splitDifference: boolean + @ivar odds: whether odds about the comparison should be computed + @type odds: boolean + @ivar overlapResults: count the number of overlaps + @type overlapResults: dictionary + @ivar oddResults: compute the number of times each transcript overlaps (or is merged with) another one + @type oddResults: dictionary + @ivar outputContainer: container of the output transcripts + @type outputContainer: class L{TranscriptContainer} + @ivar logHandle: log handle + @type logHandle: file + @ivar verbosity: verbosity + @type verbosity: int + """ + + def __init__(self, logHandle = None, verbosity = 0): + """ + Constructor + @param transcriptListParser2: parser to the list of reference transcripts + @type transcriptListParser2: class L{TranscriptListParser} + @param logHandle: log handle + @type logHandle: file + @param verbosity: verbosity + @type verbosity: int + """ + self.QUERY = 0 + self.REFERENCE = 1 + self.WORKING = 2 + self.INPUTTYPES = (self.QUERY, self.REFERENCE) + self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING) + self.typeToString = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"} + + self.logHandle = logHandle + self.verbosity = verbosity + self.mySqlConnection = MySqlConnection(self.verbosity-1) + self.inputTranscriptContainers = [None, None] + self.tableNames = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))] + self.mySqlTranscriptWriters = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames] + self.writer = None + self.introns = [False, False] + self.starts = [None, None] + self.ends = [None, None] + self.fivePrimes = [None, None] + self.threePrimes = [None, None] + self.minDistance = None + self.maxDistance = 0 + self.minOverlap = 1 + self.pcOverlap = None + 
self.colinear = False + self.antisense = False + self.downstreams = [False, False] + self.upstreams = [False, False] + self.outputDistance = False + self.absolute = False + self.strandedDistance = False + self.nbTranscripts = [None, None] + self.nbNucleotides = [None, None] + self.normalization = False + self.included = False + self.including = False + self.invert = False + self.notOverlapping = False + self.splitDifference = False + self.multiple = False + self.odds = False + self.overlapResults = None + self.oddResults = None + self.outputContainer = None + self.transcriptsToBeStored = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES]) + self.nbPrinted = 0 + + self.mySqlConnection.createDatabase() + + + def __del__(self): + """ + Destructor + Remove all temporary tables + """ + for type in self.INPUTWORKINGTYPES: + self.mySqlTranscriptWriters[type].removeTables() + self.mySqlConnection.deleteDatabase() + + def acceptIntrons(self, type, bool): + """ + Compare transcripts or exons only + @param type: whether use query/reference data + @type type: int + @param bool: include introns or not + @type bool: boolean + """ + self.introns[type] = bool + + + def restrictToStart(self, type, size): + """ + Restrict a list of transcripts to first nucleotides + @param type: whether use query/reference data + @type type: int + @param size: the size of the transcript to be considered + @type size: int + """ + self.starts[type] = size + self.introns[type] = False + + + def restrictToEnd(self, type, size): + """ + Restrict a list of transcripts to first nucleotides + @param type: whether use query/reference data + @type type: int + @param size: the size of the transcript to be considered + @type size: int + """ + self.ends[type] = size + self.introns[type] = False + + + def extendFivePrime(self, type, size): + """ + Extend a list of transcripts by their 5' end + @param type: whether use query/reference data + @type type: int + @param size: size of the extension + 
@type size: int + """ + self.fivePrimes[type] = size + + + def extendThreePrime(self, type, size): + """ + Extend the list of query transcripts by their 3' end + @param type: whether use query/reference data + @type type: int + @param size: size of the extension + @type size: int + """ + self.threePrimes[type] = size + + + def setMinDistance(self, distance): + """ + Set the min distance between two transcripts + @param distance: distance + @type distance: int + """ + self.minDistance = distance + + + def setMaxDistance(self, distance): + """ + Set the max distance between two transcripts + @param distance: distance + @type distance: int + """ + self.maxDistance = distance + + + def setMinOverlap(self, overlap): + """ + Set the minimum number of nucleotides to declare an overlap + @param overlap: minimum number of nucleotides + @type overlap: int + """ + self.minOverlap = overlap + + + def setPcOverlap(self, overlap): + """ + Set the percentage of nucleotides to declare an overlap + @param overlap: percentage of nucleotides + @type overlap: int + """ + self.pcOverlap = overlap + + + def setUpstream(self, type, boolean): + """ + Consider transcripts which are upstream of some transcripts + @param type: whether use query/reference data + @type type: int + @param boolean: consider only these transcripts or not + @type boolean: boolean + """ + self.upstreams[type] = boolean + + + def setDownstream(self, type, boolean): + """ + Consider transcripts which are downstream of some transcripts + @param type: whether use query/reference data + @type type: int + @param boolean: consider only these transcripts or not + @type boolean: boolean + """ + self.downstreams[type] = boolean + + + def setOutputDistance(self, boolean): + """ + Output distance between query and reference instead of query transcript + @param boolean: whether distance should be output + @type boolean: boolean + """ + self.outputDistance = boolean + + + def setAbsolute(self, boolean): + """ + Do not consider 
strand when computing distance (thus, having only non-negative values) + @param boolean: whether we should consider strands + @type boolean: boolean + """ + self.absolute = boolean + + + def setStrandedDistance(self, boolean): + """ + Return two distance distributions, one per strand + @param boolean: whether we should return 2 distance distance + @type boolean: boolean + """ + self.strandedDistance = boolean + + + def getColinearOnly(self, boolean): + """ + Only consider transcripts that overlap in the same direction + @param boolean: whether transcripts should overlap in the same direction + @type boolean: boolean + """ + self.colinear = boolean + + + def getAntisenseOnly(self, boolean): + """ + Only consider transcripts that overlap in the opposite direction + @param boolean: whether transcripts should overlap in the opposite direction + @type boolean: boolean + """ + self.antisense = boolean + + + def setIncludedOnly(self, boolean): + """ + Keep the elements from first set which are included in the second set + @param boolean: whether to keep included elements only + @type boolean: boolean + """ + self.included = boolean + + + def setIncludingOnly(self, boolean): + """ + Keep the elements from second set which are included in the first set + @param boolean: whether to keep included elements only + @type boolean: boolean + """ + self.including = boolean + + + def setNormalization(self, boolean): + """ + Normalize the elements by the number of mappings in the genome + @param boolean: whether normalize + @type boolean: boolean + """ + self.normalization = boolean + + + def getInvert(self, boolean): + """ + Only consider transcripts that do not overlap + @param boolean: whether invert the selection + @type boolean: boolean + """ + self.invert = boolean + + + def includeNotOverlapping(self, boolean): + """ + Also output the elements which do not overlap + @param boolean: whether output the elements which do not overlap + @type boolean: boolean + """ + 
self.notOverlapping = boolean + + + def setSplitDifference(self, boolean): + """ + Split into intervals when computing difference + @param boolean: whether to split + @type boolean: boolean + """ + self.splitDifference = boolean + + + def aggregate(self, boolean): + """ + In merge mode, aggregate multiple transcripts + @param boolean: aggregate multiple transcripts + @type boolean: boolean + """ + self.multiple = boolean + + + def getTables(self, type): + """ + Get the SQL tables + @param type: type of the table (query, reference, etc.) + @type type: int + """ + return self.mySqlTranscriptWriters[type].getTables() + + + def computeOdds(self, boolean): + """ + Compute odds + @param boolean: whether odds should be computed + @type boolean: boolean + """ + self.odds = boolean + if self.odds: + self.overlapResults = dict() + + + def computeOddsPerTranscript(self, boolean): + """ + Compute odds for each transcript + @param boolean: whether odds for each transcript should be computed + @type boolean: boolean + """ + self.odds = boolean + if self.odds: + self.overlapResults = dict() + + + def removeTables(self): + """ + Remove the temporary MySQL tables + """ + for type in self.INPUTWORKINGTYPES: + for chromosome in self.getTables(type): + self.getTables(type)[chromosome].remove() + + + def clearTables(self): + """ + Empty the content of the databases + """ + for type in self.INPUTWORKINGTYPES: + if self.transcriptListParsers[type] != None: + for chromosome in self.getTables(type): + self.getTables(type)[chromosome].clear() + + + def extendTranscript(self, type, transcript): + """ + Extend a transcript corresponding to the parameters of the class + @param transcript: a transcript + @type transcript: class L{Transcript} + @return: the possibly extended transcript + """ + extendedTranscript = Transcript() + extendedTranscript.copy(transcript) + if self.starts[type] != None: + extendedTranscript.restrictStart(self.starts[type]) + if self.ends[type] != None: + 
extendedTranscript.restrictEnd(self.ends[type]) + if self.fivePrimes[type] != None: + extendedTranscript.extendStart(self.fivePrimes[type]) + if self.threePrimes[type] != None: + extendedTranscript.extendEnd(self.threePrimes[type]) + return extendedTranscript + + + def storeTranscript(self, type, transcript, now = True): + """ + Add a transcript to a MySQL database, or postpone the store + @param type: whether use query/reference table + @type type: int + @param transcript: a transcript + @type transcript: class L{Transcript} + @param now: whether transcript should be stored now (or stored can be postponed) + @type now: bool + """ + self.mySqlTranscriptWriters[type].addTranscript(transcript) + if type == self.REFERENCE: + self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript) + if now: + self.mySqlTranscriptWriters[type].write() + if type == self.REFERENCE: + self.mySqlTranscriptWriters[self.WORKING].write() + + + def writeTranscript(self, transcript): + """ + Write a transcript in the output file + @param transcript: a transcript + @type transcript: class L{Transcript} + """ + if self.writer != None: + self.writer.addTranscript(transcript) + self.nbPrinted += 1 + + + def flushData(self, type = None): + """ + Store the remaining transcripts + @param type: whether use query/reference table (None for all) + @type type: int or None + """ + if type == None: + types = self.INPUTWORKINGTYPES + else: + types = [type] + for type in types: + self.mySqlTranscriptWriters[type].write() + if self.writer != None: + self.writer.write() + + + def unstoreTranscript(self, type, transcript): + """ + Remove a transcript from a MySQL database + @param type: whether use query/reference table + @type type: int + @param transcript: a transcript + @type transcript: class L{Transcript} + """ + self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript) + if type == self.REFERENCE: + 
self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript) + + + def addIndexes(self, tables): + """ + Add useful indexes to the tables + @param tables: which tables should be indexed + @type tables: list of int + """ + for type in tables: + for chromosome in self.getTables(type): + self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"]) + self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"]) + + + def storeTranscriptList(self, type, transcriptListParser, extension): + """ + Store a transcript list into database + @param type: whether use query/reference parser + @type type: int + @param parser: a parser of transcript list + @type parser: class L{TranscriptContainer} + @param extension: extend (or not) the transcripts + @type extension: boolean + """ + progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1) + for transcript in transcriptListParser.getIterator(): + if extension: + transcript = self.extendTranscript(type, transcript) + self.mySqlTranscriptWriters[type].addTranscript(transcript) + progress.inc() + self.mySqlTranscriptWriters[type].write() + progress.done() + if type == self.REFERENCE: + for chromosome in self.getTables(self.REFERENCE): + self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1) + self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome]) + + + def setInputTranscriptContainer(self, type, inputTranscriptContainer): + """ + Set an input transcript list container + @param type: whether use query/reference parser + @type type: int + @param inputTranscriptContainer: a container + @type inputTranscriptContainer: class 
L{TranscriptContainer} + """ + self.inputTranscriptContainers[type] = inputTranscriptContainer + self.nbTranscripts[type] = self.inputTranscriptContainers[type].getNbTranscripts() + self.nbNucleotides[type] = self.inputTranscriptContainers[type].getNbNucleotides() + + + def setOutputWriter(self, writer): + """ + Set an output transcript list writer + @param writer: a writer + @type writer: class L{TranscriptListWriter} + """ + self.writer = writer + + + def compareTranscript(self, transcript1, transcript2, includeDistance = False): + """ + Compare two transcripts, using user defined parameters + @param transcript1: a transcript from the query set (already extended) + @type transcript1: class L{Transcript} + @param transcript2: a transcript from the reference set (already extended) + @type transcript2: class L{Transcript} + @param includeDistance: take into account the distance too + @type includeDistance: boolean + @return: true, if they overlap + """ + extendedTranscript1 = Transcript() + extendedTranscript1.copy(transcript1) + if includeDistance: + if self.maxDistance > 0: + extendedTranscript1.extendStart(self.maxDistance) + extendedTranscript1.extendEnd(self.maxDistance) + + minOverlap = self.minOverlap + if self.pcOverlap != None: + minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap) + if not extendedTranscript1.overlapWith(transcript2, self.minOverlap): + return False + if (self.downstreams[self.QUERY] and transcript2.getStart() > extendedTranscript1.getStart()) or \ + (self.upstreams[self.QUERY] and transcript2.getEnd() < extendedTranscript1.getEnd()) or \ + (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \ + (self.upstreams[self.REFERENCE] and extendedTranscript1.getEnd() < transcript2.getEnd()): + return False + if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != 
transcript2.getDirection()): + return False + if self.included and not transcript2.include(extendedTranscript1): + return False + if self.including and not extendedTranscript1.include(transcript2): + return False + if self.introns[self.REFERENCE] and self.introns[self.QUERY]: + if self.logHandle != None: + self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2))) + return True + if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap): + if self.logHandle != None: + self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2))) + return True + return False + + + def compareTranscriptToList(self, transcript1): + """ + Compare a transcript to the reference list of transcripts + (Do not extend the transcripts, except for the distance) + @param transcript1: a transcript (from the query set) + @type transcript1: class L{Transcript} + @return: the reference transcripts overlapping + """ + # no transcript in the reference table + if transcript1.getChromosome() not in self.getTables(self.WORKING): + return + + # retrieve the the transcripts that may overlap in the working tables + clauses = [] + extendedTranscript1 = Transcript() + extendedTranscript1.copy(transcript1) + if self.maxDistance > 0: + extendedTranscript1.extendStart(self.maxDistance) + if self.maxDistance > 0: + extendedTranscript1.extendEnd(self.maxDistance) + command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName()) + for binPair in extendedTranscript1.getBins(): + clause = "bin " + if binPair[0] == binPair[1]: + clause += "= %i" % (binPair[0]) + else: + clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1]) + clauses.append(clause) + command += " OR ".join(clauses) + command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart()) + + for index2, 
transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command): + if self.compareTranscript(extendedTranscript1, transcript2): + yield transcript2 + + + def compareTranscriptList(self): + """ + Compare a list of transcript to the reference one + @return: the transcripts that overlap with the reference set + """ + distance = 0 + nbClustersIn = 0 + nbClustersOut = 0 + if self.maxDistance != None: + distance = self.maxDistance + + self.addIndexes([self.QUERY, self.REFERENCE]) + + # export the container into tables + self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) + self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) + + # looping + for chromosome1 in sorted(self.getTables(self.QUERY).keys()): + # get range of transcripts + command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) + query = self.mySqlConnection.executeQuery(command) + result = query.getLine() + first = result[0] + last = result[1] + nb = result[2] + + transcripts1 = [] + toBeRemoved1 = [] + transcripts2 = [] + toBeRemoved2 = [] + overlapsWith = [] + nbOverlaps = [] + nbChunks = max(1, nb / 100) + chunkSize = (last - first) / nbChunks + progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) + for chunk in range(nbChunks + 1): + + # load transcripts + start = first + chunk * chunkSize + end = start + chunkSize - 1 + command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) + for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): + transcripts1.append(transcript1) + overlapsWith.append([]) + nbOverlaps.append(0) + nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") + command = "DELETE FROM %s WHERE start < %d" % 
(self.getTables(self.QUERY)[chromosome1].getName(), end) + self.mySqlConnection.executeQuery(command) + if chromosome1 in self.getTables(self.REFERENCE): + command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) + if chunk == 0: + command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) + for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): + transcripts2.append(transcript2) + command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) + self.mySqlConnection.executeQuery(command) + + # compare sets + for index1, transcript1 in enumerate(transcripts1): + overlappingNames = [] + nbElements1 = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") + for transcript2 in transcripts2: + if self.compareTranscript(transcript1, transcript2, True): + id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() + if id2 not in overlapsWith[index1]: + overlapsWith[index1].append(id2) + nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements") + if self.odds: + if transcript2.getName() not in self.overlapResults: + self.overlapResults[transcript2.getName()] = 0 + self.overlapResults[transcript2.getName()] += nbElements1 + + # check if query transcript extends bounds of the chunk + if transcript1.getEnd() < end: + if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: + if overlapsWith[index1]: + transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) + transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) + elif self.notOverlapping: + transcript1.setTagValue("nbOverlaps", "0") + self.writeTranscript(transcript1) + nbClustersOut += 
nbElements1 + toBeRemoved1.append(index1) + + # update list of query transcripts + for index1 in reversed(toBeRemoved1): + del transcripts1[index1] + del overlapsWith[index1] + del nbOverlaps[index1] + toBeRemoved1 = [] + + # check if the reference transcripts extends bounds of the chunk + for index2, transcript2 in enumerate(transcripts2): + if transcript2.getEnd() + distance < end: + toBeRemoved2.append(index2) + for index2 in reversed(toBeRemoved2): + del transcripts2[index2] + toBeRemoved2 = [] + + progress.inc() + + for index1, transcript1 in enumerate(transcripts1): + if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: + if overlapsWith[index1]: + transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) + transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) + elif self.notOverlapping: + transcript1.setTagValue("nbOverlaps", "0") + self.writeTranscript(transcript1) + nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") + progress.done() + self.getTables(self.QUERY)[chromosome1].remove() + if chromosome1 in self.getTables(self.REFERENCE): + self.getTables(self.REFERENCE)[chromosome1].remove() + self.getTables(self.WORKING)[chromosome1].remove() + + self.flushData() + if self.writer != None: + self.writer.close() + self.writer = None + + if self.verbosity > 0: + print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) + print "query: %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn) + if self.nbTranscripts[self.QUERY] != 0: + print "output: %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100), + if nbClustersOut != 0: + print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100) + + + def compareTranscriptListDistance(self): + """ + Compare a list of transcript to the reference one + @return: the distance distributions in a hash + 
""" + nbDistances = 0 + distances = {} + absDistances = {} + strandedDistances = dict([(strand, {}) for strand in (1, -1)]) + + # export the container into tables + self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) + self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) + + progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1) + for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator(): + # get the distance + transcript1 = self.extendTranscript(self.QUERY, transcript1) + distance = self.maxDistance + 1 + strand = None + closestElement = "None" + for transcript2 in self.compareTranscriptToList(transcript1): + thisStrand = transcript1.getDirection() * transcript2.getDirection() + if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()): + transcript2.reverse() + if self.absolute: + transcript2.setDirection(transcript1.getDirection()) + if transcript2.getDirection() == transcript1.getDirection(): + if self.starts[self.REFERENCE] != None: + transcript2.restrictStart(self.starts[self.REFERENCE]) + if self.ends[self.REFERENCE] != None: + transcript2.restrictEnd(self.ends[self.REFERENCE]) + thisDistance = transcript1.getRelativeDistance(transcript2) + if (self.absolute): + thisDistance = abs(thisDistance) + if abs(thisDistance) < abs(distance): + distance = thisDistance + strand = thisStrand + closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() + if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance): + nbDistances += 1 + distances[distance] = distances.get(distance, 0) + 1 + absDistance = abs(distance) + absDistances[absDistance] = absDistances.get(absDistance, 0) + 1 + strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0) + if distance not in 
strandedDistances[-strand]: + strandedDistances[-strand][distance] = 0 + + # write transcript + if distance == self.maxDistance + 1: + distance = "None" + tmpTranscript = Transcript() + tmpTranscript.copy(transcript1) + tmpTranscript.setTagValue("distance", distance) + tmpTranscript.setTagValue("closestElement", closestElement) + self.writeTranscript(tmpTranscript) + + progress.inc() + progress.done() + + self.flushData() + + if self.verbosity > 0: + print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]) + print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) + if nbDistances == 0: + print "Nothing matches" + else: + print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances) + print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100) + + if self.strandedDistance: + return strandedDistances + return distances + + + def compareTranscriptListMerge(self): + """ + Merge the query list of transcript with itself + @return: the merged transcripts in a transcript list database + """ + nbMerges = 0 + + for type in (self.QUERY, self.REFERENCE): + self.storeTranscriptList(type, self.inputTranscriptContainers[type], True) + self.flushData() + + # Loop on the chromosomes + for chromosome in sorted(self.getTables(self.QUERY).keys()): + if chromosome not in self.getTables(self.REFERENCE): + continue + + # Get the size of the chromosome + maxEnd = 0 + nbChunks = 0 + for type in (self.QUERY, self.REFERENCE): + command = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName()) + query = self.mySqlConnection.executeQuery(command) + maxEnd = max(maxEnd, int(query.getLine()[0])) + nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements()) + + mergedTranscripts = {} + transcripts = {self.QUERY: [], self.REFERENCE: []} + progress = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1) + for i in range(nbChunks): + rangeStart = int(i * 
(float(maxEnd) / nbChunks)) + 1 + rangeEnd = int((i+1) * (float(maxEnd) / nbChunks)) + + # Get all transcripts in query and reference from chunk + for type in (self.QUERY, self.REFERENCE): + correction = 0 if self.QUERY else self.maxDistance + command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction) + for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command): + transcripts[type].append(transcript) + + # Merge elements between the two samples + for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]): + for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]): + if referenceTranscript == None: continue + if self.compareTranscript(queryTranscript, referenceTranscript, True): + if queryTranscript.getDirection() != referenceTranscript.getDirection(): + referenceTranscript.setDirection(queryTranscript.getDirection()) + queryTranscript.merge(referenceTranscript, self.normalization) + nbMerges += 1 + transcripts[self.REFERENCE][iReference] = None + if not self.multiple: + mergedTranscripts[iQuery] = 0 + + # Remove transcripts from database + for type in (self.QUERY, self.REFERENCE): + correction = 0 if self.QUERY else self.maxDistance + command = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction) + query = self.mySqlConnection.executeQuery(command) + + # Just in case, self-merge the elements in the query (beware of mergedTranscripts!) 
+ if (self.multiple): + for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]): + if queryTranscript1 == None: continue + for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]): + if iQuery2 <= iQuery1 or queryTranscript2 == None: continue + minOverlap = self.minOverlap + if self.pcOverlap != None: + minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap) + if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear): + if queryTranscript1.getDirection() != queryTranscript2.getDirection(): + queryTranscript2.setDirection(queryTranscript1.getDirection()) + queryTranscript1.merge(queryTranscript2, self.normalization) + transcripts[self.QUERY][iQuery2] = None + nbMerges += 1 + if not self.multiple: + mergedTranscripts[iQuery1] = 0 + + # Update the sets of transcripts and write into database (also update mergedTranscripts) + newTranscripts = {self.QUERY: [], self.REFERENCE: []} + newMergedTranscripts = {} + for type in (self.QUERY, self.REFERENCE): + for i, transcript in enumerate(transcripts[type]): + if transcript == None: continue + correction = 0 if self.QUERY else self.maxDistance + if transcript.getEnd() < rangeEnd - correction: + if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): + self.writeTranscript(transcripts[type][i]) + else: + if type == self.QUERY and i in mergedTranscripts: + newMergedTranscripts[len(newTranscripts[type])] = 0 + newTranscripts[type].append(transcript) + transcripts = newTranscripts + mergedTranscripts = newMergedTranscripts + + progress.inc() + progress.done() + + for type in (self.QUERY, self.REFERENCE): + for i, transcript in enumerate(transcripts[type]): + if transcripts == None: continue + if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): + self.writeTranscript(transcripts[type][i]) + + # Manage chromosomes with no corresponding data + if 
self.multiple: + for type in self.INPUTTYPES: + for chromosome in self.getTables(type): + if chromosome in self.getTables(1 - type): + continue + for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator(): + self.writeTranscript(transcript) + + self.flushData() + if self.writer != None: + self.writer.close() + self.writer = None + + if self.verbosity > 0: + print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) + print "# merges: %d" % (nbMerges) + print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) + + + def compareTranscriptListSelfMerge(self): + """ + Merge the query list of transcript with itself + @return: the merged transcripts in a transcript list database + """ + nbMerges = 0 + distance = self.maxDistance if self.maxDistance != None else 0 + + self.addIndexes([self.QUERY]) + self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) + self.flushData() + + # looping + for chromosome1 in sorted(self.getTables(self.QUERY).keys()): + transcripts2 = [] + + # get range of transcripts + progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) + command = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName()) + for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): + + # compare sets + toBeRemoved = set() + toBePrinted = set() + for index2, transcript2 in enumerate(transcripts2): + + if self.compareTranscript(transcript1, transcript2, True): + if transcript1.getDirection() != transcript2.getDirection(): + transcript2.setDirection(transcript1.getDirection()) + transcript1.merge(transcript2, self.normalization) + toBeRemoved.add(index2) + nbMerges += 1 + elif transcript2.getEnd() + distance < transcript1.getStart(): + toBePrinted.add(index2) + transcripts2.append(transcript1) + + for index2 in 
sorted(toBePrinted): + self.writeTranscript(transcripts2[index2]) + transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)] + + for transcript2 in transcripts2: + self.writeTranscript(transcript2) + progress.done() + self.getTables(self.QUERY)[chromosome1].remove() + + self.flushData() + if self.writer != None: + self.writer.close() + self.writer = None + + if self.verbosity > 0: + print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) + print "# merges: %d" % (nbMerges) + print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) + + + def getDifferenceTranscriptList(self): + """ + Get the elements of the first list which do not overlap the second list (at the nucleotide level) + @return: the transcripts that overlap with the reference set + """ + distance = 0 if self.maxDistance == None else self.maxDistance + + self.addIndexes([self.QUERY, self.REFERENCE]) + + # export the container into tables + self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) + self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) + + # looping + for chromosome1 in sorted(self.getTables(self.QUERY).keys()): + # get range of transcripts + command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) + query = self.mySqlConnection.executeQuery(command) + result = query.getLine() + first = result[0] + last = result[1] + nb = result[2] + + transcripts1 = [] + transcripts2 = [] + nbChunks = max(1, nb / 100) + chunkSize = (last - first) / nbChunks + progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) + for chunk in range(nbChunks + 1): + + # load transcripts + start = first + chunk * chunkSize + end = start + chunkSize - 1 + command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % 
(self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) + for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): + transcripts1.append(transcript1) + command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) + self.mySqlConnection.executeQuery(command) + if chromosome1 in self.getTables(self.REFERENCE): + command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) + if chunk == 0: + command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) + for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): + transcripts2.append(transcript2) + command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) + self.mySqlConnection.executeQuery(command) + + # compare sets + toBeRemoved1 = [] + for index1, transcript1 in enumerate(transcripts1): + newTranscript1 = Transcript() + newTranscript1.copy(transcript1) + for transcript2 in transcripts2: + newTranscript1 = newTranscript1.getDifference(transcript2) + if newTranscript1 == None: + toBeRemoved1.append(index1) + break + transcripts1[index1] = newTranscript1 + + # check if query transcript extends bounds of the chunk + if newTranscript1 != None and newTranscript1.getEnd() < end: + if self.splitDifference: + for exon in newTranscript1.getExons(): + transcript = Transcript() + transcript.copy(exon) + self.writeTranscript(transcript) + else: + self.writeTranscript(newTranscript1) + toBeRemoved1.append(index1) + + # update list of query transcripts + for index1 in reversed(toBeRemoved1): + del transcripts1[index1] + + # check if the reference transcripts extends bounds of the chunk + toBeRemoved2 = [] + for index2, transcript2 in enumerate(transcripts2): + if transcript2.getEnd() + distance < 
end: + toBeRemoved2.append(index2) + for index2 in reversed(toBeRemoved2): + del transcripts2[index2] + + progress.inc() + + for transcript1 in transcripts1: + if self.splitDifference: + for exon in transcript1.getExons(): + transcript = Transcript() + transcript.copy(exon) + self.writeTranscript(transcript) + else: + self.writeTranscript(transcript1) + progress.done() + self.getTables(self.QUERY)[chromosome1].remove() + if chromosome1 in self.getTables(self.REFERENCE): + self.getTables(self.REFERENCE)[chromosome1].remove() + self.getTables(self.WORKING)[chromosome1].remove() + + self.flushData() + if self.writer != None: + self.writer.close() + self.writer = None + + if self.verbosity > 0: + print "query: %d elements" % (self.nbTranscripts[self.QUERY]) + print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) + print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) + + + def getOddsPerTranscript(self): + """ + Return overlap results + @return a dict of data + """ + if not self.odds: + raise Exception("Did not compute odds!") + return self.overlapResults + + + def getOdds(self): + """ + Return odds about the overlap + @return a dict of data + """ + if not self.odds: + raise Exception("Did not compute odds!") + if self.oddResults != None: + return self.oddResults + self.oddResults = {} + for name, value in self.overlapResults.iteritems(): + self.oddResults[value] = self.oddResults.get(value, 0) + 1 + return self.oddResults diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/structure/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/testInstall.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/testInstall.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,103 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +""" +Test if the configuration is sound +""" + +import sys +import os +import subprocess + +# Test Python files +try : + from SMART.Java.Python.misc.RPlotter import * +except: + print "Cannot find Python scripts! Update PYTHONPATH (currently %s) environment variable and see configuration in the documentation!" % (os.environ["PYTHONPATH"] if "PYTHONPATH" in os.environ else "empty") + sys.exit(3) + +try : + from SMART.Java.Python.mySql.MySqlTranscriptTable import * + from SMART.Java.Python.mySql.MySqlConnection import * +except: + print "SQLite is not installed ! Please read the documentation!" 
+ sys.exit(4) + + +if __name__ == "__main__": + + print "Python scripts are correctly read." + + # Test mySQL + connection = MySqlConnection() + table = MySqlTranscriptTable(connection) + + try: + table.createTranscriptTable() + except: + print "Cannot connect to the SQLite database! See configuration in the documentation!" + sys.exit(5) + + print "SQLite database is correctly set up." + + + # Test R + fileName = "tmpFile.R" + file = open(fileName, "w") + file.write("?licence\n") + file.close() + rCommand = "R" + if "SMARTRPATH" in os.environ: + rCommand = os.environ["SMARTRPATH"] + command = "\"%s\" CMD BATCH %s" % (rCommand, fileName) + status = subprocess.call(command, shell=True) + os.remove(fileName) + outputFileName = "%sout" % (fileName) + if os.path.exists(outputFileName): + os.remove(outputFileName) + + if status != 0: + print "Problem with the execution of R script (command '%s' did not work, current directory is %s, status is %d)! See configuration in the documentation!" % (command, os.getcwd(), status) + sys.exit(6) + + line = {0: 1, 1: 2} + pngFileName = "tmpFile.png" + plotter = RPlotter(pngFileName) + plotter.addLine(line) + try: + plotter.plot() + except: + print "Problem with the execution of R script: library 'RColorBrewer' is missing! See configuration in the documentation!" + sys.exit(7) + os.remove(pngFileName) + + print "R is available." + + print "Set up is fine! Enjoy S-MART!" diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/toolLauncher/RnaFoldLauncher.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/toolLauncher/RnaFoldLauncher.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,379 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os
import sys
import random
import subprocess


class RnaFoldStructure(object):
    """
    A structure to store the output of RNAFold
    @ivar name: the name of the sequence
    @type name: string
    @ivar sequence: the sequence (with gaps)
    @type sequence: string
    @ivar structure: the bracket structure
    @type structure: string
    @ivar energy: the energy of the fold
    @type energy: float
    @ivar interactions: for each position, the index of the paired position
                        (None if unpaired); None until L{analyze} is called
    @type interactions: list of int
    """

    def __init__(self, name, sequence, structure, energy):
        """
        Initialize the structure
        @param name: the name of the sequence
        @type name: string
        @param sequence: the sequence (with gaps)
        @type sequence: string
        @param structure: the bracket structure
        @type structure: string
        @param energy: the energy of the fold
        @type energy: float
        """
        self.name = name
        self.sequence = sequence
        self.structure = structure
        self.energy = energy
        self.interactions = None


    def analyze(self):
        """
        Analyze the output, assign the interactions
        Exits (SystemExit) if sequence and structure have different sizes,
        or if the brackets of the structure are not balanced.
        """
        if len(self.sequence) != len(self.structure):
            sys.exit("Sizes of sequence and structure differ ('%s' and '%s')!\n" % (self.sequence, self.structure))
        stack = []
        self.interactions = [None] * len(self.structure)
        for i, symbol in enumerate(self.structure):
            if symbol == "(":
                stack.append(i)
            elif symbol == ")":
                if not stack:
                    sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure))
                otherI = stack.pop()
                self.interactions[i] = otherI
                self.interactions[otherI] = i
        if stack:
            sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure))


    def _getRegion(self, start, end):
        """
        Get the positions visited between start and end (both included)
        @param start: first position (None: beginning of the sequence)
        @type start: int
        @param end: last position (None: end of the sequence)
        @type end: int
        @return: the positions, clipped to the sequence
        """
        if start is None:
            start = 0
        if end is None:
            end = len(self.sequence)
        if start < 0:
            # a negative start never matches a position: nothing is visited
            return range(0)
        return range(start, min(end, len(self.sequence) - 1) + 1)


    def getNbBulges(self, start = None, end = None):
        """
        Get the number of insertions in a given range (in number of letters)
        An unpaired position counts for one; so does a paired position whose
        partner is not adjacent to the partner of the previous position.
        @param start: where to start the count
        @type start: int
        @param end: where to end the count (included)
        @type end: int
        @return: the number of bulges
        """
        # NOTE(review): indentation was lost in the patch this code comes from;
        # previousInt is assumed to be tracked inside the region only -- confirm
        # against the upstream file.
        nbBulges = 0
        previousInt = None
        for i in self._getRegion(start, end):
            partner = self.interactions[i]
            if partner is None:
                nbBulges += 1
            elif previousInt is not None and abs(partner - previousInt) != 1:
                nbBulges += 1
            previousInt = partner
        return nbBulges


    def getStar(self, start = None, end = None):
        """
        Get the supposed miRNA*
        @param start: where to start the count
        @type start: int
        @param end: where to end the count (included)
        @type end: int
        @return: a couple (smallest, greatest) of the positions paired with
                 the region; (1000000, 0) if nothing is paired
        """
        minStar = 1000000
        maxStar = 0
        for i in self._getRegion(start, end):
            partner = self.interactions[i]
            if partner is not None:
                minStar = min(minStar, partner)
                maxStar = max(maxStar, partner)
        return (minStar, maxStar)



class RnaFoldLauncher(object):
    """
    Start and parse a RNAFold instance
    @ivar inputTranscriptList: the input list of transcripts
    @type inputTranscriptList: class L{TranscriptList}
    @ivar genomeFileParser: a parser to the genome file
    @type genomeFileParser: class L{SequenceListParser}
    @ivar bothStrands: whether folding is done on both strands
    @type bothStrands: bool
    @ivar nbStrands: number of strands to fold (2: both strands)
    @type nbStrands: int
    @ivar fivePrimeExtension: extension towards the 5' end
    @type fivePrimeExtension: int
    @ivar threePrimeExtension: extension towards the 3' end
    @type threePrimeExtension: int
    @ivar outputTranscriptList: the output list of transcripts
    @type outputTranscriptList: class L{TranscriptList}
    @ivar tmpInputFileName: the name of the temporary input file for RNAFold
    @type tmpInputFileName: string
    @ivar tmpOutputFileName: the name of the temporary output file for RNAFold
    @type tmpOutputFileName: string
    @ivar verbosity: verbosity
    @type verbosity: int [default: 0]
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type verbosity: int
        """
        self.verbosity = verbosity
        self.transcriptNames = []
        randomNumber = random.randint(0, 100000)
        self.bothStrands = True
        # fold() reads nbStrands: give it a value consistent with bothStrands
        # so that calling setNbStrands() first is not mandatory
        self.nbStrands = 2
        self.tmpInputFileName = "tmpInput_%d.fas" % (randomNumber)
        self.tmpOutputFileName = "tmpOutput_%d.fas" % (randomNumber)
        self.inputTranscriptList = None
        self.outputTranscriptList = None
        self.genomeFileParser = None
        self.fivePrimeExtension = 0
        self.threePrimeExtension = 0


    def __del__(self):
        """
        Remove the temporary files
        """
        # getattr: the attributes are missing if the constructor did not complete
        for attribute in ("tmpInputFileName", "tmpOutputFileName"):
            fileName = getattr(self, attribute, None)
            if fileName is not None and os.path.isfile(fileName):
                os.remove(fileName)


    def setTranscriptList(self, inputTranscriptList):
        """
        Set the list of transcripts
        @param inputTranscriptList: a list of transcripts
        @type inputTranscriptList: class L{TranscriptList}
        """
        self.inputTranscriptList = inputTranscriptList


    def setExtensions(self, fivePrime, threePrime):
        """
        Set extension sizes
        @param fivePrime: extension towards the 5' end
        @type fivePrime: int
        @param threePrime: extension towards the 3' end
        @type threePrime: int
        """
        self.fivePrimeExtension = fivePrime
        self.threePrimeExtension = threePrime


    def setNbStrands(self, nbStrands):
        """
        Set number of strands
        @param nbStrands: if 2, the work is done on both strands
        @type nbStrands: int
        """
        self.nbStrands = nbStrands


    def setGenomeFile(self, fileName):
        """
        Set the genome file
        @param fileName: the multi-FASTA file containing the genome
        @type fileName: a string
        """
        from commons.core.parsing.FastaParser import FastaParser
        self.genomeFileParser = FastaParser(fileName, self.verbosity)


    def writeInputFile(self, transcript, reverse, fivePrimeExtension, threePrimeExtension):
        """
        Write the RNAFold input file, containing the sequence corresponding to the transcript
        @param transcript: a transcript
        @type transcript: class L{Transcript}
        @param reverse: invert the extensions
        @type reverse: bool
        @param fivePrimeExtension: extension towards the 5' end
        @type fivePrimeExtension: int
        @param threePrimeExtension: extension towards the 3' end
        @type threePrimeExtension: int
        """
        from SMART.Java.Python.structure.Transcript import Transcript
        transcriptCopy = Transcript(transcript)
        transcriptCopy.removeExons()
        if not reverse:
            transcriptCopy.extendStart(fivePrimeExtension)
            transcriptCopy.extendEnd(threePrimeExtension)
        else:
            transcriptCopy.extendStart(threePrimeExtension)
            transcriptCopy.extendEnd(fivePrimeExtension)
        sequence = transcriptCopy.extractSequence(self.genomeFileParser)
        # ':' and '.' are replaced by '_' in the name written to the input file
        name = sequence.getName().replace(":", "_").replace(".", "_")
        with open(self.tmpInputFileName, "w") as handle:
            handle.write(">%s\n%s\n" % (name, sequence.getSequence()))


    def startRnaFold(self):
        """
        Start RNAFold
        Raises an Exception if the command returns a non-zero status.
        """
        command = "RNAfold < %s > %s" % (self.tmpInputFileName, self.tmpOutputFileName)
        if self.verbosity > 100:
            print("Launching command '%s'" % (command))
        status = subprocess.call(command, shell=True)
        if status != 0:
            raise Exception("Problem with RNAFold! Input file is %s, status is: %s" % (self.tmpInputFileName, status))


    def parseRnaFoldOutput(self):
        """
        Parse an output file of RNAFold
        The file should contain exactly three lines: the header, the sequence,
        and the bracket structure followed by the energy between parentheses.
        Raises an Exception if the file does not contain three lines.
        @return: an RnaFoldStructure
        """
        with open(self.tmpOutputFileName) as handle:
            lines = handle.readlines()
        if len(lines) != 3:
            raise Exception("Problem in RNAfold output! '%s'" % (lines))
        name = lines[0].strip()[1:].split()[0]
        sequence = lines[1].strip()
        structureLine = lines[2].strip()
        structure = structureLine.split()[0]
        # the energy is what follows the first space, without the enclosing parentheses
        energy = float(structureLine.split(" ", 1)[1][1:-1].strip())
        if self.verbosity > 100:
            print("Getting sequence %s, structure %s" % (sequence, structure))
        return RnaFoldStructure(name, sequence, structure, energy)


    def analyzeRnaFoldOutput(self, transcript, rnaFoldOutput, reverse, fivePrimeExtension, threePrimeExtension):
        """
        Analyze the RNAFold
        @param transcript: a transcript
        @type transcript: class L{Transcript}
        @param rnaFoldOutput: the output of RNAFold
        @type rnaFoldOutput: class L{RnaFoldStructure}
        @param reverse: invert the extensions
        @type reverse: bool
        @param fivePrimeExtension: extension towards the 5' end
        @type fivePrimeExtension: int
        @param threePrimeExtension: extension towards the 3' end
        @type threePrimeExtension: int
        @return: a tuple of energy, number of bulges, start of miRNA*, end of miRNA*, reverse
        """
        rnaFoldOutput.analyze()
        transcriptSize = transcript.end - transcript.start + 1
        start = fivePrimeExtension if not reverse else threePrimeExtension
        end = start + transcriptSize
        energy = rnaFoldOutput.energy
        nbBulges = rnaFoldOutput.getNbBulges(start, end)
        (minStar, maxStar) = rnaFoldOutput.getStar(start, end)
        # convert positions in the folded sequence to positions on the genome
        minStar += transcript.start - start
        maxStar += transcript.start - start
        if self.verbosity > 100:
            print("Getting structure with energy %.2f, nbBulges %d, miRna* %d-%d, strand %s" % (energy, nbBulges, minStar, maxStar, "-" if reverse else "+"))
        return (energy, nbBulges, minStar, maxStar, reverse)


    def fold(self, transcript):
        """
        Fold a transcript (in each strand)
        @param transcript: a transcript
        @type transcript: class L{Transcript}
        @return: the result of L{analyzeRnaFoldOutput} with the fewest bulges
                 (the first one in case of a tie)
        """
        strands = [False, True] if self.nbStrands == 2 else [False]
        results = []
        for reverse in strands:
            self.writeInputFile(transcript, reverse, self.fivePrimeExtension, self.threePrimeExtension)
            self.startRnaFold()
            output = self.parseRnaFoldOutput()
            results.append(self.analyzeRnaFoldOutput(transcript, output, reverse, self.fivePrimeExtension, self.threePrimeExtension))
        # min() returns the first minimal element
        return min(results, key = lambda result: result[1])


    def refold(self, transcript):
        """
        Fold a transcript, knowing where the miRNA starts and end
        @param transcript: a transcript, with a "miRnaStar" tag formatted as "start-end"
        @type transcript: class L{Transcript}
        @return: the energy
        """
        miStar = transcript.getTagValue("miRnaStar")
        startMiStar = int(miStar.split("-")[0])
        endMiStar = int(miStar.split("-")[1])
        fivePrimeExtension = max(0, transcript.start - startMiStar) + 5
        threePrimeExtension = max(0, endMiStar - transcript.end) + 5
        self.writeInputFile(transcript, False, fivePrimeExtension, threePrimeExtension)
        self.startRnaFold()
        output = self.parseRnaFoldOutput()
        result = self.analyzeRnaFoldOutput(transcript, output, False, fivePrimeExtension, threePrimeExtension)
        return result[0]


    def computeResults(self):
        """
        Fold all and fill an output transcript list with the values
        """
        from SMART.Java.Python.structure.TranscriptList import TranscriptList
        from SMART.Java.Python.misc.Progress import Progress
        progress = Progress(self.inputTranscriptList.getNbTranscripts(), "Handling transcripts", self.verbosity)
        self.outputTranscriptList = TranscriptList()
        for transcript in self.inputTranscriptList.getIterator():
            result = self.fold(transcript)
            transcript.setTagValue("nbBulges", result[1])
            transcript.setTagValue("miRnaStar", "%d-%d" % (result[2], result[3]))
            transcript.setTagValue("miRNAstrand", result[4])
            # refold() reads the "miRnaStar" tag set just above
            transcript.setTagValue("energy", self.refold(transcript))
            self.outputTranscriptList.addTranscript(transcript)
            progress.inc()
        progress.done()


    def getResults(self):
        """
        Get an output transcript list with the values
        @return: the output list of transcripts (computed on first call)
        """
        if self.outputTranscriptList is None:
            self.computeResults()
        return self.outputTranscriptList
01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/trimAdaptor.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,107 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""Trim the sequences from a 3' adaptor"""

import sys
from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils


if __name__ == "__main__":

    # parse command line
    description = "Trim Adaptor v1.0.1: Remove the 3' adaptor of a list of reads. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
    optionParser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: sequence file format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in sequence format given by -f]")
    optionParser.add_option("-a", "--adaptor", dest="adaptor", action="store", type="string", help="adaptor [compulsory] [format: string]")
    optionParser.add_option("-e", "--errors", dest="errors", action="store", default=0, type="int" , help="number of errors in percent [format: int] [default: 0]")
    optionParser.add_option("-n", "--noAdaptor", dest="noAdaptor", action="store", default=None, type="string", help="file name where to print sequences with no adaptor [format: output file in sequence format given by -f]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # fail early with a usage message instead of a TypeError later on
    for optionName in ("inputFileName", "format", "outputFileName", "adaptor"):
        if getattr(options, optionName) is None:
            optionParser.error("option '%s' is compulsory" % (optionName))

    minSize = 2

    # one parser class and one writer class per supported format
    formats = {"fasta": (FastaParser, FastaWriter), "fastq": (FastqParser, FastqWriter)}
    if options.format not in formats:
        sys.exit("Cannot handle files with '%s' format." % (options.format))
    parserClass, writerClass = formats[options.format]

    sequenceParser = parserClass(options.inputFileName, options.verbosity)
    writer = writerClass(options.outputFileName, options.verbosity)
    writerNoAdaptor = None
    if options.noAdaptor is not None:
        writerNoAdaptor = writerClass(options.noAdaptor, options.verbosity)

    nbFound = 0
    nbSequences = sequenceParser.getNbSequences()

    progress = Progress(nbSequences, "Reading %s" % (options.inputFileName), options.verbosity)
    for sequence in sequenceParser.getIterator():
        progress.inc()
        nucleotides = sequence.getSequence()
        found = False
        for i in range(len(nucleotides) - minSize):
            # compare the adaptor and the end of the read (from i) on their common size
            size = min(len(options.adaptor), len(nucleotides) - i)
            adaptorPart = options.adaptor[:size]
            nucleotidesPart = nucleotides[i:i + size]
            if Utils.getHammingDistance(adaptorPart, nucleotidesPart) <= int(options.errors / 100.0 * size):
                # first acceptable match: keep what precedes the adaptor
                nbFound += 1
                sequence.shrinkToFirstNucleotides(i)
                found = True
                break
        writer.addSequence(sequence)
        if not found and writerNoAdaptor is not None:
            writerNoAdaptor.addSequence(sequence)
    progress.done()

    # close the writers explicitly, as trimSequences.py does
    writer.close()
    if writerNoAdaptor is not None:
        writerNoAdaptor.close()

    percentage = float(nbFound) / nbSequences * 100 if nbSequences else 0.0
    print("%d sequences with adaptors on %d (%.2f%%)" % (nbFound, nbSequences, percentage))
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Remove sequences with low reliability"""

from optparse import OptionParser
from commons.core.parsing.SequenceListParser import SequenceListParser
from commons.core.writer.FastaWriter import FastaWriter
from SMART.Java.Python.misc.Progress import Progress


if __name__ == "__main__":

    # parse command line
    description = "Trim Sequences v1.0.1: Remove sequences with low reliability: low occurrences and highly repeted. [Category: Personnal]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in FASTA format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    optionParser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = optionParser.parse_args()

    sequenceParser = SequenceListParser(options.inputFileName, options.verbosity)
    nbSequences = sequenceParser.getNbSequences()
    progress = Progress(nbSequences, "Parsing file %s" % (options.inputFileName), options.verbosity)

    writer = FastaWriter(options.outputFileName, options.verbosity)
    logHandle = open("log.txt", "w") if options.log else None

    letters = ("A", "C", "G", "T")
    nbLowComplexity = 0
    nbTooManyOccurrences = 0

    # the original called "getIteractor", a misspelling of the "getIterator"
    # method used by the other scripts on the same parsers
    for sequence in sequenceParser.getIterator():
        halfSize = len(sequence.sequence) // 2
        nbOccurrences = dict((letter, 0) for letter in letters)
        for char in sequence.sequence:
            if char in nbOccurrences:
                nbOccurrences[char] += 1

        if min(nbOccurrences.values()) == 0:
            # at least one of the four letters is absent
            nbLowComplexity += 1
            if logHandle is not None:
                logHandle.write("Low complexity for %s\n" % (sequence.sequence))
        elif max(nbOccurrences.values()) > halfSize:
            # one letter makes more than half of the sequence
            nbTooManyOccurrences += 1
            if logHandle is not None:
                logHandle.write("Too many occurrences for %s\n" % (sequence.sequence))
        else:
            writer.addSequence(sequence)

        progress.inc()
    progress.done()

    # close the writer explicitly, as trimSequences.py does
    writer.close()
    if logHandle is not None:
        logHandle.close()

    ratioLowComplexity = float(nbLowComplexity) / nbSequences * 100 if nbSequences else 0.0
    ratioTooManyOccurrences = float(nbTooManyOccurrences) / nbSequences * 100 if nbSequences else 0.0
    print("%d out of %d have low complexity (%f%%)" % (nbLowComplexity, nbSequences, ratioLowComplexity))
    print("%d out of %d have too many occurrences (%f%%)" % (nbTooManyOccurrences, nbSequences, ratioTooManyOccurrences))
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Remove the 5' and/or 3' adaptors of a list of reads"""

from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils


if __name__ == "__main__":

    # parse command line
    description = "Trim Sequences v1.0.3: Remove the 5' and/or 3' adaptors of a list of reads. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
    optionParser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: sequence file format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in sequence format given by -f]")
    optionParser.add_option("-3", "--threePAdaptor", dest="threePAdaptor", action="store", default=None, type="string", help="3' adaptor [format: string] [default: None]")
    optionParser.add_option("-5", "--fivePAdaptor", dest="fivePAdaptor", action="store", default=None, type="string", help="5' adaptor [format: string] [default: None]")
    optionParser.add_option("-e", "--errors", dest="errors", action="store", default=0, type="int", help="number of errors in percent [format: int] [default: 0]")
    optionParser.add_option("-d", "--indels", dest="indels", action="store_true", default=False, help="also accept indels [format: bool] [default: False]")
    optionParser.add_option("-n", "--noAdaptor5p", dest="noAdaptor5p", action="store", default=None, type="string", help="print sequences with no 5' adaptor [format: output file in sequence format given by -f]")
    optionParser.add_option("-m", "--noAdaptor3p", dest="noAdaptor3p", action="store", default=None, type="string", help="print sequences with no 3' adaptor [format: output file in sequence format given by -f]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    minSize = 3

    # one parser class and one writer class per supported format
    formats = {"fasta": (FastaParser, FastaWriter), "fastq": (FastqParser, FastqWriter)}
    if options.format not in formats:
        raise Exception("Cannot handle files with '%s' format." % (options.format))
    parserClass, writerClass = formats[options.format]

    sequenceParser = parserClass(options.inputFileName, options.verbosity)
    writer = writerClass(options.outputFileName, options.verbosity)

    # the counters are always defined, whether or not the "no adaptor" files are requested
    nbFound5p = 0
    nbFound3p = 0
    writer5pNoAdaptor = None
    if options.noAdaptor5p is not None:
        writer5pNoAdaptor = writerClass(options.noAdaptor5p, options.verbosity)
    writer3pNoAdaptor = None
    if options.noAdaptor3p is not None:
        writer3pNoAdaptor = writerClass(options.noAdaptor3p, options.verbosity)

    # the distance does not depend on the read: choose it once
    getDistance = Utils.getLevenshteinDistance if options.indels else Utils.getHammingDistance
    nbSequences = sequenceParser.getNbSequences()

    progress = Progress(nbSequences, "Reading %s" % (options.inputFileName), options.verbosity)
    for sequence in sequenceParser.getIterator():
        progress.inc()
        if options.threePAdaptor is not None:
            nucleotides = sequence.sequence
            bestScore = 10000
            bestRegion = None
            for i in range(len(nucleotides) - minSize):
                # compare the adaptor and the end of the read (from i) on their common size
                size = min(len(options.threePAdaptor), len(nucleotides) - i)
                adaptorPart = options.threePAdaptor[:size]
                nucleotidesPart = nucleotides[i:i + size]
                score = getDistance(adaptorPart, nucleotidesPart)
                if score <= int(options.errors / 100.0 * size) and score < bestScore:
                    bestScore = score
                    bestRegion = i
            if bestRegion is not None:
                nbFound3p += 1
                sequence.shrinkToFirstNucleotides(bestRegion)
            elif writer3pNoAdaptor is not None:
                writer3pNoAdaptor.addSequence(sequence)
        if options.fivePAdaptor is not None:
            # read again: the sequence may have been shortened by the 3' step
            nucleotides = sequence.sequence
            bestScore = 10000
            bestRegion = None
            for i in reversed(range(minSize, len(nucleotides))):
                # compare the end of the adaptor and the start of the read (up to i) on their common size
                size = min(len(options.fivePAdaptor), i)
                adaptorPart = options.fivePAdaptor[len(options.fivePAdaptor) - size:]
                nucleotidesPart = nucleotides[i - size:i]
                score = getDistance(adaptorPart, nucleotidesPart)
                if score <= int(options.errors / 100.0 * size) and score < bestScore:
                    bestScore = score
                    bestRegion = i
            if bestRegion is not None:
                nbFound5p += 1
                sequence.shrinkToLastNucleotides(len(nucleotides) - bestRegion)
            elif writer5pNoAdaptor is not None:
                writer5pNoAdaptor.addSequence(sequence)
        writer.addSequence(sequence)
    progress.done()
    writer.close()
    # the side files are closed like the main output
    if writer5pNoAdaptor is not None:
        writer5pNoAdaptor.close()
    if writer3pNoAdaptor is not None:
        writer3pNoAdaptor.close()

    print("%d sequences" % (nbSequences))
    if options.fivePAdaptor is not None:
        print("%d sequences with 5' adaptors (%.2f%%)" % (nbFound5p, float(nbFound5p) / nbSequences * 100 if nbSequences else 0.0))
    if options.threePAdaptor is not None:
        print("%d sequences with 3' adaptors (%.2f%%)" % (nbFound3p, float(nbFound3p) / nbSequences * 100 if nbSequences else 0.0))
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Transform a plain text file to a FASTA file"""

import os
from optparse import OptionParser
from SMART.Java.Python.structure.Sequence import *
from SMART.Java.Python.misc.Progress import *


if __name__ == "__main__":

    # parse command line
    description = "Txt to Fasta v1.0.1: Convert a Txt file (one sequence per line) into Fasta file. [Category: Personnal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in TXT format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # each input line is "sequence" or "sequence count"; the sequence is
    # written 'count' times, and is used as its own name
    with open(options.inputFileName) as inputFile:
        with open(options.outputFileName, "w") as outputFile:
            for line in inputFile:
                splittedLine = line.split()
                if not splittedLine:
                    # blank line: nothing to convert
                    continue
                sequence = splittedLine[0]
                nb = 1 if len(splittedLine) == 1 else int(splittedLine[1])
                for i in range(nb):
                    outputFile.write(">%s\n%s\n" % (sequence, sequence))
/usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Update a .qual file given a .fasta file""" + +from optparse import OptionParser +from commons.core.parsing.FastaParser import * +from SMART.Java.Python.misc.Progress import * + + +if __name__ == "__main__": + + # parse command line + description = "Update Qual v1.0.1: Remove the sequence in a Qual file which are not in the corresponding Fasta file. 
[Category: Personnal]" + + parser = OptionParser(description = description) + parser.add_option("-f", "--fasta", dest="fastaFile", action="store", type="string", help="fasta file [compulsory] [format: file in FASTA format]") + parser.add_option("-q", "--qual", dest="qualFile", action="store", type="string", help="qual file [compulsory] [format: file in QUAL format]") + parser.add_option("-o", "--output", dest="outputFile", action="store", type="string", help="output file [compulsory] [format: output file in QUAL format]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + parser = SequenceListParser(options.fastaFile, options.verbosity) + nbSequences = parser.getNbSequences() + progress = Progress(nbSequences, "Parsing file %s" % (options.fastaFile), options.verbosity) + qualHandle = open(options.qualFile) + outputHandle = open(options.outputFile, "w") + nbRefused = 0 + nbTotal = 0 + + names = [] + while parser.getNextSequence(): + sequence = parser.getCurrentSequence() + nbTotal += 1 + + found = False + name = None + for line in qualHandle: + line = line.strip() + if line.startswith(">"): # startswith() does not fail on an empty line, unlike line[0] + name = line[1:] + if name == sequence.name: + found = True + else: + nbRefused += 1 + else: + if found: + outputHandle.write(">%s\n%s\n" % (name, line)) + found = False + name = None + break + progress.inc() + progress.done() + + + outputHandle.close() + qualHandle.close() + + print "%d out of %d are refused (%f%%)" % (nbRefused, nbTotal, (float(nbRefused) / nbTotal * 100 if nbTotal else 0.0)) diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/wigExploder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/wigExploder.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,99 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Explode wig files into several files, one for each chromosome""" + +import os, re, sys +from optparse import OptionParser + + +if __name__ == "__main__": + + # parse command line + description = "Wig Exploder v1.0.1: Explode a big WIG file into several smaller WIG files (one per chromosome). 
[Category: Personal]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in WIG format]") + parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output directory [compulsory] [format: directory]") + parser.add_option("-s", "--strand", dest="strand", action="store", default=None, type="string", help="strand of the input WIG file (if any) [format: choice (+, -)]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + inputFile = open(options.inputFileName) + + files = {} + file = None + trackLine = None + strand = "" + if options.strand != None: + strand = options.strand + + for line in inputFile: + line = line.strip() + + if line.startswith("track"): + trackLine = line + continue + + m1 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=\d+\s+step=\d+\s*$", line) + m2 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=\d+\s+step=\d+\s+span=\d+\s*$", line) + m3 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line) + m4 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s+span=\d+\s*$", line) # whitespace separates chrom and span, as in m2 + + m = None + if m1 != None: + m = m1 + elif m2 != None: + m = m2 + elif m3 != None: + m = m3 + elif m4 != None: + m = m4 + + if m != None: + chromosome = m.group(1) + + if chromosome in files: + file = files[chromosome] + else: + file = open("%s%s%s%s.wig" % (options.output, os.sep, chromosome, strand), "w") + files[chromosome] = file + if trackLine != None: + file.write("%s\n" % (trackLine)) + + if file == None: + sys.exit("Header is missing (current first line is '%s')! Aborting..." 
% (line)) + + file.write("%s\n" % (line)) + + inputFile.close() + + for chromosome in files: + files[chromosome].close() diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/Java/Python/wrongFastqToQual.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/wrongFastqToQual.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,81 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +"""Convert a pseudo-FASTQ file to QUAL files""" + +import os, sys +from optparse import OptionParser +from SMART.Java.Python.misc.Progress import * +from math import * + +if __name__ == "__main__": + + # parse command line + description = "Wrong FastQ to Qual v1.0.1: Convert a pseudo-FastQ (i.e. a FastQ file with a wrong format) into a Qual file. [Category: Personnal]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in FASTQ format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in QUAL format]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") + (options, args) = parser.parse_args() + + inputFile = open(options.inputFileName) + outputFastaFile = open("%s.fasta" % (options.outputFileName), "w") + outputQualFile = open("%s.qual" % (options.outputFileName), "w") + + inSequence = False + inQuality = True + sequenceName = None + for line in inputFile: + line = line.strip() + if line.startswith("@"): # startswith() does not fail on a blank line, unlike line[0] + if inQuality == False: + sys.exit("Quality of %s is missing" % (sequenceName)) + inSequence = True + inQuality = False + sequenceName = line[1:] + outputFastaFile.write(">%s\n" % (sequenceName)) + elif line.startswith("+"): + if inSequence == False: + sys.exit("Sequence of %s is missing" % (line[1:])) + inSequence = False + inQuality = True + if sequenceName != line[1:]: + sys.exit("Names in sequence and qual are different (%s, %s)" % (sequenceName, line[1:])) + outputQualFile.write(">%s\n" % (sequenceName)) + else: + if inSequence: + outputFastaFile.write("%s\n" % (line)) + elif inQuality: + outputQualFile.write("%s\n" % (line)) + + inputFile.close() + outputFastaFile.close() + outputQualFile.close() diff -r d22fadc825e3 -r 2c0c0a89fad7
SMART/Java/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/CleanTranscriptFile.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CleanTranscriptFile.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,77 @@ + + Clean a transcript file so that it is useable for S-MART. + + PYTHONPATH + + ../Java/Python/CleanTranscriptFile.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #end if + #if $optionType.type == 'Yes': + -t $optionType.value + #end if + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A GFF/GTF file (please consult http://www.sequenceontology.org/gff3.shtml to know more about the GFF3 format, and http://mblab.wustl.edu/GTF22.html for the GTF format) may contain different sources of information: chromosome size, genes, transcripts, etc. S-MART mostly works on transcripts. This scripts filters the input file to keep the information you really want, based on the feature (3rd column). + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/Clusterize.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/Clusterize.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,73 @@ + + Clusterize features when their genomic intervals overlap. 
+ + PYTHONPATH + + + ../Java/Python/clusterize.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + -o $outputFileGff + $colinear + $normalize + -d $distance + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +The script clusterizes the input genomic data. Two features are clusterized when their genomic intervals overlap. The output is a GFF3 file, where each element is a cluster. The number of elements in the cluster is given by the tag **nbElements**. The name of a cluster is the concatation of the names of its reads (like **read1--read2--read3**). Note that if the size of the name of the cluster exceeds 100 characters, it is truncated to the first 100 characters. + +Some options may clusterize the features which are closer than a given distance. + +By default, the tool clusterizes all features which overlap (or nearly overlap), even if they are on different strands. If you want to clusterize the features which are on the same strand only, you can specify it. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/CollapseReads.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CollapseReads.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,67 @@ + + Merges two genomic features if they have exactly the same genomic coordinates. 
+ + PYTHONPATH + + + ../Java/Python/CollapseReads.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + -$strand + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Merge two input genomic coordinates iff they are exactly the same. If two or more genomic coordinates are merged, the tag **nbElements** is updated accordingly. As a consequence, all the reads which are exactly the same appear as one genomic coordinate. + +This is especially useful for short RNA sequencing (where you want to count the number of read per miRNA, siRNA, etc.) or 5' capped short reads. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/CompareOverlappingSmallQuery.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CompareOverlappingSmallQuery.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,203 @@ + + Provide the queries that overlap with a reference, when the query data set is small. 
+ + PYTHONPATH + + + ../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + -o $outputFileGff + #if $OptionDistance.Dist == 'Yes': + -d $OptionDistance.distance + #end if + #if $OptionMinOverlap.present == 'Yes': + -m $OptionMinOverlap.minOverlap + #end if + #if $OptionPcOverlapQuery.present == 'Yes': + -p $OptionPcOverlapQuery.minOverlap + #end if + #if $OptionPcOverlapRef.present == 'Yes': + -P $OptionPcOverlapRef.minOverlap + #end if + #if $OptionCollinearOrAntiSens.OptionCA == 'Collinear': + -c + #elif $OptionCollinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + $InvertMatch + $NotOverlapping + $OptionInclusionQuery + $OptionInclusionRef + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. 
The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example). + +It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one. + +Various modifiers are also available: + +-Invert selection (report those which do not overlap). + +-Restrict to colinear / anti-sense overlapping data. + +-Keep the query data even if they do not strictly overlap with the reference data, but are located no further away than *n* nucleotides from some reference data. + +-Keep the query data which are strictly included in reference data, meaning that a query transcript such that at least 1 nucleotide does not overlap with reference data will not be presented as a solution. + +The mechanism of shrinking and extending is also useful to make a fine-grained comparison. For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. Now, if you want to keep those which are overlapping your data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcription factor sites which are upstream. + +One option reverses the selection. In other words, it performs the comparison as usual, and outputs all those query data which do not overlap. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/CompareOverlappingSmallRef.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CompareOverlappingSmallRef.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,203 @@ + + Provide the queries that overlap with a reference, when the reference dataset is small.
+ + PYTHONPATH + + + ../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + -o $outputFileGff + #if $OptionDistance.Dist == 'Yes': + -d $OptionDistance.distance + #end if + #if $OptionMinOverlap.present == 'Yes': + -m $OptionMinOverlap.minOverlap + #end if + #if $OptionPcOverlapQuery.present == 'Yes': + -p $OptionPcOverlapQuery.minOverlap + #end if + #if $OptionPcOverlapRef.present == 'Yes': + -P $OptionPcOverlapRef.minOverlap + #end if + #if $OptionCollinearOrAntiSens.OptionCA == 'Collinear': + -c + #elif $OptionCollinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + $InvertMatch + $NotOverlapping + $OptionInclusionQuery + $OptionInclusionRef + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. 
The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example). + +It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one. + +Various modifiers are also available: + +-Invert selection (report those which do not overlap). + +-Restrict to colinear / anti-sense overlapping data. + +-Keep the query data even if they do not strictly overlap with the reference data, but are located no further away than *n* nucleotides from some reference data. + +-Keep the query data which are strictly included in reference data, meaning that a query transcript such that at least 1 nucleotide does not overlap with reference data will not be presented as a solution. + +The mechanism of shrinking and extending is also useful to make a fine-grained comparison. For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. Now, if you want to keep those which are overlapping your data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcription factor sites which are upstream. + +One option reverses the selection. In other words, it performs the comparison as usual, and outputs all those query data which do not overlap. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/ConvertTranscriptFile.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,98 @@ + + Convert a file from one format to another.
+ + PYTHONPATH + + ../Java/Python/convertTranscriptFile.py -i $inputFormatType.inputFileName + #if $inputFormatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $inputFormatType.FormatInputFileName == 'bed': + -f bed + #elif $inputFormatType.FormatInputFileName == 'bam': + -f blast + #elif $inputFormatType.FormatInputFileName == 'sam': + -f sam + #elif $inputFormatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + -g $outputFormatType.outFormat + + -n $name + $strand + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Simple conversion tool. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/CountReadGCPercent.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CountReadGCPercent.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,20 @@ + + Count GC percent for each read against a genome. + + PYTHONPATH + + ../Java/Python/CountReadGCPercent.py -i $inputFastaFile -j $inputGffFile -o $outputFile + + + + + + + + + + +Count the GC% of a FASTA file. + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/GetDifferentialExpression.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/GetDifferentialExpression.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,214 @@ + + Get the differential expression between 2 conditions using Fisher's exact test, on regions defined by a third file. 
+ + PYTHONPATH + + + ../Java/Python/GetDifferentialExpression.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + -k $formatTypeRef.inputFileNameRef + #if $formatTypeRef.FormatInputFileNameRef == 'bed': + -l bed + #elif $formatTypeRef.FormatInputFileNameRef == 'gff': + -l gff + #elif $formatTypeRef.FormatInputFileNameRef == 'gff2': + -l gff2 + #elif $formatTypeRef.FormatInputFileNameRef == 'gff3': + -l gff3 + #elif $formatTypeRef.FormatInputFileNameRef == 'sam': + -l sam + #elif $formatTypeRef.FormatInputFileNameRef == 'gtf': + -l gtf + #end if + + -o $outputFileGff + + $simple + $adjusted + + #if $optionSimplePara.simplePara == 'Yes': + -S $optionSimplePara.paraValue + #end if + + #if $optionFixedSizeFactor.FSF == 'Yes': + -x $optionFixedSizeFactor.FSFValue + #end if + + #if $optionFDR.FDR == 'Yes': + -d $optionFDR.FDRValue + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool compares two sets of data and find the differential expression. 
One very important component of the tool is the reference set. Actually, to use the tool, you need the two input sets of data, of course, and the reference set. The reference set is a set of genomic coordinates and, for each interval, the tool will count the number of features in each sample and compute the differential expression. For each reference interval, it will output the direction of the regulation (up or down, with respect to the first input set), and a *p*-value from a Fisher exact test. + +This reference set seems boring. Why not compute the differential expression without this set? The answer is: the differential expression of what? I cannot guess it. Actually, you might want to compare the expression of genes, of small RNAs, of transposable elements, of anything... So the reference set can be a list of genes, and in this case, you can compute the differential expression of genes. But you can also compute many other things. + +Suppose that you cluster the data of your two input samples (you can do it with the *clusterize* and the *mergeTranscriptLists* tools). You now have a list of all the regions which are transcribed in at least one of the input samples. This can be your reference set. This reference set is interesting since you can detect the differential expression of data which is outside any annotation. + +Suppose now that you clusterize the two input samples using a sliding window (you can do it with the *clusterizeBySlidingWindows* and the *mergeSlidingWindowsClusters* tools). You can now select all the regions of a given size which contain at least one read in one of the two input samples (do it with *selectByTag* and the tag **nbElements**). Again, this can be another interesting reference set. + +In most cases, the sizes of the two input samples will be different, so you should probably normalize the data, which is an available option.
The ---rather crude--- normalization increases the number of data in the least populated sample and decreases the number of data in the most populated sample to the average number of data. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/GetFlanking.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/GetFlanking.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,194 @@ + + Get the flanking regions of a set of reference. + + PYTHONPATH + + + ../Java/Python/GetFlanking.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + #if $OptionUpDownStream.OptionUD == 'UpStream': + -5 + #elif $OptionUpDownStream.OptionUD == 'DownStream': + -3 + #end if + + + #if $OptionColinearOrAntiSens.OptionCA == 'Colinear': + -c + #elif $OptionColinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + + #if $OptionMax.maximum == "Yes": + -D $OptionMax.max + #end if + #if $OptionMin.minimum == "Yes": + -d $OptionMin.min + #end if + + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool prints the elements from 
the second set of genomic intervals which are closest to (in other words, are flanking) the elements from the first set. You can also play on different parameters: + +- restrict the search to downstream or upstream elements, or print downstream and upstream elements, + +- only consider collinear flanking elements, + +- only consider anti-sense flanking elements, + +- only consider elements which are close enough (using some given distance), + +- only consider flanking elements which do not overlap with the reference element. + +Notice that elements from the second sets may be printed at most once, whether they are the flanking element of several elements from the first or not. + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/SelectByTag.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/SelectByTag.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,122 @@ + + Keep the genomic coordinates such that a value of a given tag. + + PYTHONPATH + + + ../Java/Python/SelectByTag.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + -g $Tag + #if $OptionValue.Value == "Yes": + -a $OptionValue.valeur + #end if + #if $OptionMax.maximum == "Yes": + -M $OptionMax.max + #end if + #if $OptionMin.minimum == "Yes": + -m $OptionMin.min + #end if + + #if $OptionDefault.default == "Yes": + -d $OptionDefault.defaultValue + #end if + + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +The script reads a list of genomic coordinates and output all the features with specific tag values. 
If you want to know more about tags, please consult the GFF format page: http://www.sequenceontology.org/gff3.shtml + +The tool reads the input file, and more specifically the tag that you specified. You can mention a lower and an upper bound for its value, or a specific value, and the tool will print all the features such that the tag value is between the specified bounds or matches the string. + +A tag has to be present for each feature. If not, you can specify a default value which will be used if the tag is absent. + +This tool can be used to select the clusters with a minimum number of elements (the tag **nbElements** counts the number of elements per cluster) or to select the reads which have mapped fewer than *n* times (the tag **nbOccurrences** counts the number of mappings per read). + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/WrappGetLetterDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/WrappGetLetterDistribution.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,97 @@ +#! 
/usr/bin/env python + +import os +import sys +import getopt +from commons.core.checker.CheckerException import CheckerException + +SMART_PATH = "%s/SMART" % os.environ["REPET_PATH"] + +class WrappGetLetterDistribution(object): + + def __init__(self): + self._inputFileName = "" + self._inputFileFormat = "" + self._outputFileName = "tmpOutputFile" + self._csv = False + + def help( self ): + print + print "usage: %s [ options ]" % ( sys.argv[0] ) + print "options:" + print " -h: this help" + print " -i: input file" + print " -f: 'fasta' or 'fastq'" + print " -c: CSV output file" + print " -a: first PNG output file" + print " -b: second PNG output file" + print + print "Exemple:" + print + print "1:\n\tpython WrappGetLetterDistribution.py -i inputFile.fasta -f fasta -c outputFile1.csv -a outputFile2.png -b outputFile3.png" + print + print "2:\n\tpython WrappGetLetterDistribution.py -i inputFile.fastq -f fastq -c outputFile1.csv -a outputFile2.png -b outputFile3.png" + print + print + + + def setAttributesFromCommandLine(self): + try: + opts, args = getopt.getopt( sys.argv[1:], "hi:f:a:b:c:" ) + except getopt.GetoptError, err: + print str(err); sys.exit(1) + for o, a in opts: + if o == "-h": + self.help() + sys.exit(0) + if o == "-i": + self._inputFileName = a + elif o == "-f": + self._inputFileFormat = a + elif o == "-c": + self._outputFileNameCSV = a + self._csv = True + elif o == "-a": + self._outputFileNamePNG = a + elif o == "-b": + self._outputFileNamePerNtPNG = a + + def checkAttributes(self): + lMsg = [] + if self._inputFileName == "" and not os.path.exists(self._inputFileName): + lMsg.append("ERROR: This input file doesn't exist!") + if self._inputFileFormat == "": + lMsg.append("ERROR: No input file format specified in option!") + if self._outputFileNamePNG == "": + lMsg.append("ERROR: No output file.png specified in option!") + if self._outputFileNamePerNtPNG == "": + lMsg.append("ERROR: No output filePerNt.png specified in option!") + if 
self._outputFileNameCSV == "" and self._csv == True : + lMsg.append("ERROR: No output file.csv specified in option!") + + print ">>> lMsg " + str(lMsg) + if lMsg != []: + exp = CheckerException() + exp.setMessages(lMsg) + raise (exp) + + def _cleanWorkingDir(self, cDir): + os.system("rm %s/tmpData* %s/tmpScript*" % (cDir, cDir)) + + def wrapp(self): + self.checkAttributes() + cDir = os.getcwd() + + if self._csv == True: + os.system("python %s/Java/Python/getLetterDistribution.py -i %s -f %s -o %s/%s -c" % (SMART_PATH, self._inputFileName, self._inputFileFormat, cDir, self._outputFileName)) + os.system("mv %s/%s.csv %s" % (cDir, self._outputFileName, self._outputFileNameCSV)) + os.system("mv %s/%s.png %s" % (cDir, self._outputFileName, self._outputFileNamePNG)) + os.system("mv %s/%sPerNt.png %s" % (cDir, self._outputFileName, self._outputFileNamePerNtPNG)) + + self._cleanWorkingDir(cDir) + +if __name__ == '__main__': + launcher = WrappGetLetterDistribution() + launcher.setAttributesFromCommandLine() + launcher.wrapp() + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/WrappGetLetterDistribution.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/WrappGetLetterDistribution.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,48 @@ + + Calculate distribution for each nucleotide per position for all short reads + + PYTHONPATH + + + WrappGetLetterDistribution.py -i $inputFileName + #if $formatType.FormatInputFileName == 'fasta': + -f fasta + #else : + -f fastq + #end if + -c $ouputFileNameCSV -a $ouputFileNamePNG1 -b $ouputFileNamePNG2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +The script gets the nucleotide distribution of the input sequence list. It outputs two files. The first file shows the nucleotide distribution of the data. More precisely, a point (*x*, *y*) on the curve **A** shows that *y* sequences have *x* % of **A**. + +The second plot shows the average nucleotide distribution for each position of the read. 
You can use it to detect a bias in the first nucleotides, for instance. A point *x*, *y* on the curve **A** shows that at the position *x*, there are *y*% of **A**. A point (*x*, *y*) on the curve **#** tells you that *y* % of the sequences contain not less than *x* nucleotides. By definition, this latter line is a decreasing function. It usually explains why the tail of the other curves are sometimes erratic: there are few sequences. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/changeGffFeatures.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/changeGffFeatures.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,23 @@ + + Change a feature in a GFF file (the feature is the 3rd column). + + PYTHONPATH + + + ../Java/Python/changeGffFeatures.sh $inputFile $inputFeature $outputFeature >$outputFile + + + + + + + + + + + + + This script changes the third column of a GFF3 file (please refer to http://www.sequenceontology.org/gff3.shtml to know more about this format). + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/changeTagName.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/changeTagName.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,51 @@ + + Change the name of a tag in a GFF file. + + PYTHONPATH + + + ../Java/Python/changeTagName.py -i $formatType.inputFileName + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #end if + + -t $Tag + -n $name + + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Change the name of a tag in the 9th field of a GFF3 file (please consult http://www.sequenceontology.org/gff3.shtml to know more about this format). 
+ + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/clusterizeBySlidingWindows.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/clusterizeBySlidingWindows.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,141 @@ + + Produces a GFF3 file that clusters a list of transcripts using a sliding window. Cluster the data into regions (defined by size and overlap with next region). + + PYTHONPATH + + + ../Java/Python/clusterizeBySlidingWindows.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + -s $size + -e $overlap + -o $outputFileGff + $normalize + $strands + + #if $OptionTag.tag == "Yes": + -g $OptionTag.value + #end if + + #if $OptionsOperation.operation == "Yes": + -r $OptionsOperation.value + #end if + + #if $OptionWriteTag.writeTag == "Yes": + -w $OptionWriteTag.value + #end if + + $strand + $plot $plotPng + $excel $excelOutput + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Sliding windows are a convenient ways to clusterize data mapped on the genome. There are two important parameters of a sliding window: the size of the window and the size of the overlap. + +By default, sliding windows count the number of reads in each window. However, you can basically merge any information which is contained in the tags. You can compute the average, sum, median, max or min of the tags for each window. For instance, every window can contain the average cluster size, if you merge clusters instead of reads. + +The output file is a GFF3 file, where each element is a window. 
There is a special tag for each window, whose name is **nbElements** if you counted the number of transcripts per sliding window. However, if you performed a **min** (resp. **max**, **sum**, **median**, **average**) operation on the tag **value** of the transcripts, then the tag of the window will be **minValue** (resp. **maxValue**, **sumValue**, **medValue**, **avgValue**). You can also specify the name of your tag (which is actually advised: **nbReadsInSample1** will always be more informative than **nbElements**). + +You also have different options, which can select the *n* % highest regions, or the regions with at least *n* features in them, or even the regions with at least *n* unique features. This last option is useful when you want to cluster the reads which have mapped only once, for instance. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/compareOverlapping.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/compareOverlapping.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,310 @@ + + Print all the transcripts from a first file which overlap with the transcripts from a second file. 
+ + PYTHONPATH + + + ../Java/Python/CompareOverlapping.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + -o $outputFileGff + + #if $optionNFirstFile1.NFirstForFile1 == 'Yes': + -S $optionNFirstFile1.firstNtFile1 + #end if + #if $optionNFirstFile2.NFirstForFile2 == 'Yes': + -s $optionNFirstFile2.firstNtFile2 + #end if + #if $optionNLastFile1.NLastForFile1 == 'Yes': + -U $optionNLastFile1.lastNtFile1 + #end if + #if $optionNLastFile2.NLastForFile2 == 'Yes': + -u $optionNLastFile2.lastNtFile2 + #end if + + #if $optionExtentionCinqFile1.extentionFile1 == 'Yes': + -E $optionExtentionCinqFile1.extention51 + #end if + #if $optionExtentionCinqFile2.extentionFile2 == 'Yes': + -e $optionExtentionCinqFile2.extention52 + #end if + + #if $optionExtentionTroisFile1.extentionFile1 == 'Yes': + -N $optionExtentionTroisFile1.extention31 + #end if + #if $optionExtentionTroisFile2.extentionFile2 == 'Yes': + -n $optionExtentionTroisFile2.extention32 + #end if + + #if $OptionColinearOrAntiSens.OptionCA == 'Colinear': + -c + #elif $OptionColinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + + #if $OptionDistance.Dist == 'Yes': + -d $OptionDistance.distance + #end if + + #if $OptionMinOverlap.MO == 'Yes': + -m 
$OptionMinOverlap.minOverlap + #end if + + $InvertMatch + $ReportIntron + $NotOverlapping + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example). + +It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one. + +Various modifiers are also available: + +-Restrict query / reference set to the first nucleotide. Useful to check if the TSS of one set overlap with the other one. + +-Extend query / reference set on the 5' / 3' direction. Useful to check if one set is located upstream / downstream the other one. + +-Include introns in the comparison. + +-Invert selection (report those which do not overlap). + +-Restrict to colinear / anti-sense overlapping data. + +-Keep the query data even if they do not strictly overlap with the reference data, but are located not further away than *n* nucleotide from some reference data. + +-Keep the query data with are strictly included into reference data, meaning that a query transcript such that at least 1 nucleotide does not overlap with reference data will not be presented as a solution. + +The mechanism of shrinking and extending is also useful to make a fine grain comparison. For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. 
Now, if you want to keep those which are overlapping you data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcript factor sites which are upstream. + +Some option reverses the selection. Put in other words, it performs the comparison as usual, and outputs all those query data which do not overlap. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/computeCoverage.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/computeCoverage.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,113 @@ + + Compute the coverage of a set with respect to another set. + + PYTHONPATH + + + ../Java/Python/ComputeCoverage.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + $ReportIntron + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool considers a query and a reference files, and gives the coverage of the query file by the reference. The output file is similar to the query file, where a tag **coverage** has been added. 
+ + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/coordinatesToSequence.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/coordinatesToSequence.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,68 @@ + + Coordinates to Sequences: Extract the sequences from a list of coordinates. + + PYTHONPATH + + + ../Java/Python/coordinatesToSequence.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -s $sequence + -o $outputFileFasta + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +You can use this tool, if you just want to convert your mapping data to genomic coordinates, without any filtering. It requires a genomic coordinates file together with its format, an output format (GFF3, BED, etc...), the genome, and prints you the corresponding file. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getDifference.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getDifference.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,133 @@ + + Gets all the regions of the genome, except the one given in an annotation file. Alternatively, it may also give all the elements from the first set which does not ovelap with the second set (at the nucleotide level). 
+ + PYTHONPATH + + + ../Java/Python/getDifference.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + $split + + #if $OptionSequence.option == "Yes": + -s $OptionSequence.sequence + #end if + + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tools has two different (but similar) uses. When given two sets of transcripts, it trims the elements of the set so that they do not overlap with the second set. + +When only one set of transcripts is given, together with a reference genome, it produces a list of transcripts which complements the first set. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getDistance.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getDistance.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,247 @@ + + Give the distances between every data from the first input set with respect to the data from the second input set. 
+ + PYTHONPATH + + + ../Java/Python/getDistance.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + + $absolute $proportion + + #if $OptionColinearOrAntiSens.OptionCA == "Colinear": + -c + #elif $OptionColinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + + #if $OptionMinDistance.MinD == "Yes": + -m $OptionMinDistance.minDistance + #end if + + #if $OptionMaxDistance.MaxD == "Yes": + -M $OptionMaxDistance.maxDistance + #end if + + $fivePrime $threePrime $spearMan + + #if $OptionBuckets.OBuckets == "Yes": + -u $OptionBuckets.buckets + #end if + + #if $OptionMinXaxis.MinX == "Yes": + -x $OptionMinXaxis.minXaxis + #end if + + #if $OptionMaxXaxis.MaxX == "Yes": + -X $OptionMaxXaxis.maxXaxis + #end if + + #if $OptionTitle.OTitle == "Yes": + -t $OptionTitle.title + #end if + + -o $outputFilePng + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Give the distances between every data from the first input set and the data from the second input 
set. It outputs the size distribution. Each point (*x*, *y*) tells you that there exist *y* pairs of elements which are separated by *x* nucleotides. + +The general algorithm is the following. For each element of the first input set, it finds the closest element of the second set and computes the distance between the two elements. The distance is zero if the two elements overlap. This distance may not exist if the element of the first input set is alone on its chromosome (or contig). + +Actually, considering an element from the first input set, the algorithm will look at the vicinity of this element (1kb by default). You can increase the size of the vicinity using the appropriate option. + +As in *compare overlapping*, you can shrink or extend your sets of genomic coordinates, so that you can get the distance between starts of reads and starts of genes, for instance. You can also compute the distance from elements which are on the same strand only (which is not the case by default) or on the opposite strand only. + +You have several options for the output plot. You can first choose the region on the *x*-axis you want to plot. You can also display histograms instead of a line plot. In this case, the data are summed into buckets, whose sizes are given as an option. For instance, a bucket of size *s* at the point (*x*, *y*) means that there are *y* pairs of elements which are separated by *x* to *x + s* nucleotides. + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getDistribution.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getDistribution.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,237 @@ + + Get Distribution: Get the distribution of the genomic coordinates along a genome. 
+ + PYTHONPATH + + + ../Java/Python/GetDistribution.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + -r $refFile + + #if $optionNbBin.Nb == 'Yes': + -b $optionNbBin.nbBins + #end if + + #if $optionStart.start == 'Yes': + -s $optionStart.startValue + #end if + + #if $optionEnd.end == 'Yes': + -e $optionEnd.endValue + #end if + + #if $optionHeight.height == 'Yes': + -H $optionHeight.heightValue + #end if + + #if $optionWidth.width == 'Yes': + -W $optionWidth.widthValue + #end if + + #if $optionYMin.YMin == 'Yes': + -y $optionYMin.YMinValue + #end if + + #if $optionYMax.YMax == 'Yes': + -Y $optionYMax.YMaxValue + #end if + + #if $optionChrom.chrom == 'Yes': + -c $optionChrom.chromValue + #end if + + #if $optionColor.color == 'Yes': + -l $optionColor.colorValue + #end if + + $bothStrands + $average + $normalize + -m + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Print a density profile of the data for each chromosome. You have to provide the reference genome, to know the sizes of the chromosomes. You can also provide the number of points (called *bins*) you want per chromosome. + +By default, only one curve is plotted per chromosome, but you can plot one curve per strand and per chromosome (the minus strand will be plotted with non-positive values on the *y*-axis). 
+ +If you want, you can also plot a specific region, by mentionning the chromosome, the start and the end positions of the region. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getExons.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getExons.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,87 @@ + + Get the exons of a set of transcripts. + + PYTHONPATH + + + ../Java/Python/getExons.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + #if $optionSelect.Value == "Yes": + -s $optionSelect.selectValue + #end if + + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Provide all the exons of an annotation file. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getIntrons.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getIntrons.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,71 @@ + + Get the introns of a set of transcripts. + + PYTHONPATH + + + ../Java/Python/getIntrons.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Provide all the introns of an annotation file. 
+ + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getReadDistribution.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getReadDistribution.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,70 @@ + + Get Read Distribution v1.0.1: Plot the number of identical reads and give the most represented. + + PYTHONPATH + + + ../Java/Python/WrappGetReadDistribution.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'fasta': + -f fasta + #elif $formatType.FormatInputFileName == 'fastq': + -f fastq + #end if + + #if $optionnumber.number == 'Yes': + -n $optionnumber.bestNumber + #end if + #if $optionpercent.percent == 'Yes': + -p $optionpercent.percentage + #end if + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This script gives a .tar out file, if you want to take look at the results, you have to download it. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getSizes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getSizes.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,136 @@ + + Get the sizes of a set of genomic coordinates. + + PYTHONPATH + + + ../Java/Python/getSizes.py -i $formatType.inputFileName $formatType.FormatInputFileName + + #if $OptionQuery.OptionQ == 'NONE': + -q size + #else: + $OptionQuery.OptionQ + #end if + + -o $outputFile + + #if $OptionXMax.xMax == "Yes": + -x $OptionXMax.maxValue + #end if + #if $OptionX.xLab == "Yes": + -a $OptionX.xLabValue + #end if + #if $OptionY.yLab == "Yes": + -b $OptionY.yLabValue + #end if + $barPlot + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Get the sequence/annotation size distribution. A point (*x*, *y*) means that *y* elements have a size of *x* nucleotides. 
+ +When your mapping include exon/intron structures, you can decide to count the size of the introns, the sizes of the exons or the size of the first exons. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getWigData.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getWigData.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,28 @@ + + Compute the average data for some genomic coordinates using WIG files + + PYTHONPATH + + + ../Java/Python/getWigData.py -i $inputGff3File -f gff3 -w $inputWigFile -t $tagName -$strand -o $outputFile + + + + + + + + + + + + + + +Reads a transcript list, computes the average value of some WIG data (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) for each transcript and adds a tag corresponding to this average value to the transcript. + +The script finds all the data which correspond to the genomic coordinates of a transcript, average these data and store the result into a tag. Then, the transcripts are written in an output file, together with the tag. + +You can then plot your data using *plotTranscriptList.py*. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getWigDistance.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getWigDistance.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,28 @@ + + Compute the average data around some genomic coordinates using WIG files (thus covering a large proportion of the genome). + + PYTHONPATH + + + ../Java/Python/getWigDistance.py -i $inputGff3File -f gff3 -w $inputWigFile -a 0.0 -d $distance $strand -o $outputFile + + + + + + + + + + + + + + +Plots the average data contained in a set of WIG files (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) around the first nucleotides of a annotation file. + +The tool needs an transcript list, some WIG files, and a distance. 
For each transcript, it collects all the values around its first nucleotide, the radius being given by the distance. Then, it computes the average value for each position. A point (*x*, *y*) means that the average value in the WIG file for a nucleotide distant by *x* nucleotides from the first nucleotide of an input transcript is *y*. + +You can possibly use a log scale for the *y*-axis. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/getWigProfile.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getWigProfile.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,78 @@ + + Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome). + + PYTHONPATH + + + ../Java/Python/getWigProfile.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #end if + -w $inputWigFile + -p $nbPoints + -d $distance + $strands + -o $outputFilePNG + #if $optionSMO.SMO == 'Yes': + -m $optionSMO.smoothen + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Computes the average distribution of the WIG data (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) along the transcripts given in input, and possibly before and after the transcripts. + +The main inputs of the functions are a file containing a list of transcripts (or any sets of genomic interval) and a directory containing a set of WIG files (one file per chromosome, or one file per chromosome and per strand). The function then computes the WIG profile of each transcript. 
The user can also define a region around the transcripts that should also be plotted (in this case, the profile will include the WIG values which overlap with the transcript as well as the 5' and 3' regions). Since the transcripts do not necessarily have the same sizes, all profiles will be extended or shrunk to fit in a size which is given by the user. If the resulting profile is a bit bumpy, the user can also smoothen the curve by using a linear smoothing function (the size of the smoothing window is given by the user). Finally, the user may want to plot the WIG data for the opposite strand too (if the strand-specific WIG data are available). + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/mapperAnalyzer.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/mapperAnalyzer.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,205 @@ + + Read the output of an aligner, print statistics and possibly translate into GFF, BED or GBrowse formats. + + PYTHONPATH + + + ../Java/Python/mapperAnalyzer.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'bam': + -f bam + #elif $formatType.FormatInputFileName1 == 'seqmap': + -f seqmap + #end if + + -q $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'fasta': + -k fasta + #elif $formatType2.FormatInputFileName2 == 'fastq': + -k fastq + #end if + + + #if $optionnumber.number == 'Yes': + -n $optionnumber.numberVal + #end if + #if $optionsize.size == 'Yes': + -s $optionsize.sizeVal + #end if + #if $optionidentity.identity == 'Yes': + -d $optionidentity.identityVal + #end if + #if $optionmismatch.mismatch == 'Yes': + -m $optionmismatch.mismatchVal + #end if + #if $optiongap.gap == 'Yes': + -p $optiongap.gapVal + #end if + #if $optiontitle.title == 'Yes': + -t $optiontitle.titleVal + #end if + #if
$optionappend.append == 'Yes': + -a $optionappend.appendfile + #end if + + $merge + $remove + $remain + -o $outputFileGFF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Maybe the first program you may use. It reads a set of mapping given by the tool you have used to map your data on the reference genome and translate it to a set of genomic coordinates. You also have the possibility to extract only those that you are interested in (few matches in the genome, few errors in the mapping, etc.). You can also select those reads which map less than a given of times in the genome. Moreover, you can output the data in various different formats, which you can use to visualize them *via* UCSC genome browser or GBrowse. Unmatched reads can be written in an other file, in case you would like to try to map them with another tool (may sometimes work!). + +You can filter your data according to: + +- number of errors in the mapping + +- number of occurrences of the mapping in the genome + +- size of the read mapped + +- number of gaps in the mapping + +The script needs an input file (your mapped reads) together with its format and the read sequences file together with its format (FASTA or FASTQ). If you want, you can also append the results of this script to another GFF3 file. This is useful when the GFF3 file is the result of the mapping using another tool. + +By default, any gap in the alignment to the reference sequence is treated like an exon. You can decide to remove this feature by merging short introns (actually, gaps). 
+ + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/mergeSlidingWindowsClusters.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/mergeSlidingWindowsClusters.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,110 @@ + + Merges two files containing the results of a sliding windows clustering. + + PYTHONPATH + + + ../Java/Python/mergeSlidingWindowsClusters.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Sliding windows are also useful to compare two (or more!) sets of data. This can be very valuable when you want to compare differential expression in two different conditions. When you have two different sliding windows sets, this function merges them into one, where each window contains the two pieces of information. You may want to plot the data afterwards using the *plot transcript list* function. 
+ + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/mergeTranscriptLists.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/mergeTranscriptLists.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,153 @@ + + Merge the elements of two lists of genomic coordinates. + + PYTHONPATH + + + ../Java/Python/mergeTranscriptLists.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + $all + $normalize + + #if $OptionDistance.dis == 'Yes': + -d $OptionDistance.disVal + #end if + + #if $OptionColinearOrAntiSens.OptionCA == 'Colinear': + -c + #elif $OptionColinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +The script is similar to *compare overlapping*, except that when data of two different sets overlap, they are merged. You can use the same parameters as *compare overlapping* and use them to look for transcription on both strands, for example. + +Optionally, you can also add to the output all the elements from the first set which do not overlap with the second set. 
+ + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/modifyGenomicCoordinates.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/modifyGenomicCoordinates.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,138 @@ + + Extend or shrink a list of genomic coordinates. + + PYTHONPATH + + ../Java/Python/modifyGenomicCoordinates.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + #if $OptionStart.start == "Yes": + -s $OptionStart.startValue + #end if + + #if $OptionEnd.end == "Yes": + -e $OptionEnd.endValue + #end if + + #if $OptionFivePrim.five == "Yes": + -5 $OptionFivePrim.fivePValue + #end if + + #if $OptionTroisP.TroisP == "Yes": + -3 $OptionTroisP.ThreePValue + #end if + + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool reads a list of transcripts and modifies each feature by: + +- shrinking it to the *n* first nucleotides or the *n* last nucleotides, or + +- extending it to *n* nucleotides towards the 5' direction (upstream) or the 3' direction (downstream). + +Note that the 5' or 3' direction depends on the orientation of the feature (the 5' end of a transcript located on the minus strand is on the right hand of this transcript!). + +The tool needs a transcript file, its format, and outputs a new transcript file. 
+ + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/modifySequenceList.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/modifySequenceList.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,52 @@ + + Extend or shring a list of sequences. + + PYTHONPATH + + ../Java/Python/modifySequenceList.py -i $inputFile -f fasta + #if $OptionStart.Start == "Yes": + -s $OptionStart.StartVal + #end if + #if $OptionEnd.End == "Yes": + -e $OptionEnd.EndVal + #end if + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + This tool reads a list of sequences (in multi-FASTA/Q format) that you provide and shrinks each sequence to the *n* first nucleotides or the *n* last nucleotides. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/plotCoverage.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/plotCoverage.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,274 @@ + + Plot the coverage of the first data with respect to the second one. + + PYTHONPATH + + + ../Java/Python/WrappPlotCoverage.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + + #if $optionRef.Ref == 'Yes': + -q $optionRef.inputSequenceFile + #end if + + #if $optionwidth.width == 'Yes': + -w 
$optionwidth.widthVal + #end if + #if $optionheight.height == 'Yes': + -e $optionheight.heightVal + #end if + #if $optionXlab.Xlab == 'Yes': + -x $optionXlab.XlabVal + #end if + #if $optionYlab.Ylab == 'Yes': + -y $optionYlab.YlabVal + #end if + #if $optiontitle.title == 'Yes': + -t $optiontitle.titleVal + #end if + + #if $optionplusColor.plusColor == 'Yes': + -p $optionplusColor.plusColorVal + #end if + #if $optionminusColor.minusColor == 'Yes': + -m $optionminusColor.minusColorVal + #end if + + #if $optionsumColor.sumColor == 'Yes': + -s $optionsumColor.sumColorVal + #end if + #if $optionlineColor.lineColor == 'Yes': + -l $optionlineColor.lineColorVal + #end if + + $merge + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Plot the coverage of the first set of genomic coordinates with respect to the second set of genomic coordinates. For each element of the second set (we will suppose that they are annotated genes), it computes the number of elements of the first set (reads, for instance) which overlap it. + +Alternatively, if the first file is in GFF format, and contains the **Target** file, you can omit the second file. However, a fasta file corresponding to the second file should be given (to compute the size of the reference elements). + +The tool produces two plots per gene. The first plot gives the coverage: a point (*x*, *y*) means that *y* reads cover the *x* th nucleotide of the gene. The second figure displays the (possibly spliced) gene in black, and the overlapping reads (blue is colinear, red is anti-sense). 
+ +This script gives a .tar out file, if you want to take look at the results, you have to download it. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/plotTranscriptList.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/plotTranscriptList.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,135 @@ + + Plot some information from a list of transcripts. + + PYTHONPATH + + + ../Java/Python/plotTranscriptList.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + -x $xVal + -y $yVal + #if $optionz.z == 'Yes': + -z $optionz.zVal + #end if + + -X $XVal + -Y $YVal + -Z $ZVal + + #if $optionxLab.xLab == 'Yes': + -n $optionxLab.labVal + #end if + #if $optionyLab.yLab == 'Yes': + -m $optionyLab.labVal + #end if + + $log + -s $shape + -b $bucket + + -o $outputFilePNG + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Plot the data attached as tags in a transcript list. This can be used for displaying the comparison of different sets of sliding windows, for instance. + +The tool reads the tags of a transcript file (actually, a GFF3 file). It considers more specifically the tag names that you specify as parameter. If you use only one tag name, you can display a line plot. In this case, you have to specify a bucket size *s* (which is by defaut 1) and a point (*x*, *y*) tells you that there are *y* transcripts with tag values *x* to *x + s*. + +You can display could plots if you use two tag names. Each point represents the values of the two tags of a transcript. If you use three variables, the third variable will be the color of the point. You can also use a log scale and name the axes of the plot. 
+ +Each transcript must contain the tags which are specified. If not, you should provide a default value, which is used when the tag is not present. + +If you use a cloud plot, you can compute the Spearman's rho to quantify a correlation between your two tag values. + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/removeExonLines.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/removeExonLines.sh Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,2 @@ +#!/bin/bash +sed '/exon/d' $1 diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/removeExonLines.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/removeExonLines.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,18 @@ + + Removes the lines containing Exon. + + PYTHONPATH + + ../Java/Python/removeExonLines.sh $inputFile > $outputFile + + + + + + + + + + command example: sh removeExonLines.sh input.gff3 + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/restrictFromSize.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/restrictFromSize.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,102 @@ + + Select the elements of a list of sequences or transcripts with a given size. 
+ + PYTHONPATH + + + ../Java/Python/restrictFromSize.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'fasta': + -f fasta + #elif $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + #if $OptionMax.maximum == "Yes": + -M $OptionMax.max + #end if + #if $OptionMin.minimum == "Yes": + -m $OptionMin.min + #end if + + -o $outputFileGff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Reads a list of sequences or genomic coordinates and outputs those which are longer and / or shorter than a given size ---which you provide. + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/restrictTranscriptList.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/restrictTranscriptList.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,124 @@ + + Select the features which are located in a given locus. 
+ + PYTHONPATH + + ../Java/Python/restrictTranscriptList.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + #if $OptionChrom.Chrom == "Yes": + -c $OptionChrom.ChromName + #end if + + #if $OptionStart.start == "Yes": + -s $OptionStart.startValue + #end if + + #if $OptionEnd.end == "Yes": + -e $OptionEnd.endValue + #end if + + -o $outputFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Reads a list of genomic coordinates and outputs those which on a given chromosome and / or between two given positions. + + + + + + + + + + + + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 SMART/galaxy/test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/test.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,97 @@ +#! 
class WrappGetLetterDistribution(object):
    """Command-line wrapper that runs getLetterDistribution.py and renames its outputs.

    The wrapper parses its own options (see help()), validates them, launches
    ``getLetterDistribution.py`` from the tree designated by the module-level
    ``SMART_PATH`` and moves the files it produced to the names requested by
    the caller.
    """

    def __init__(self):
        self._inputFileName = ""
        self._inputFileFormat = ""
        # Prefix (relative to the working directory) used for the files
        # written by getLetterDistribution.py before they are renamed.
        self._outputFileName = "tmpOutputFile"
        # Output targets start as "" so that checkAttributes() can report a
        # missing option instead of failing with AttributeError.
        self._outputFileNameCSV = ""
        self._outputFileNamePNG = ""
        self._outputFileNamePerNtPNG = ""
        self._csv = False

    def help(self):
        """Print the usage message on standard output."""
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("")
        print("usage: %s [ options ]" % (sys.argv[0]))
        print("options:")
        print(" -h: this help")
        print(" -i: input file")
        print(" -f: 'fasta' or 'fastq'")
        print(" -c: CSV output file")
        print(" -a: first PNG output file")
        print(" -b: second PNG output file")
        print("")
        print("Exemple:")
        print("")
        print("1:\n\tpython WrappGetLetterDistribution.py -i inputFile.fasta -f fasta -c outputFile1.csv -a outputFile2.png -b outputFile3.png")
        print("")
        print("2:\n\tpython WrappGetLetterDistribution.py -i inputFile.fastq -f fastq -c outputFile1.csv -a outputFile2.png -b outputFile3.png")
        print("")
        print("")

    def setAttributesFromCommandLine(self):
        """Fill the attributes from ``sys.argv``.

        Exits with status 1 on an unknown option and with status 0 after
        printing the help when ``-h`` is given.
        """
        try:
            opts, _ = getopt.getopt(sys.argv[1:], "hi:f:a:b:c:")
        except getopt.GetoptError as err:
            print(str(err))
            sys.exit(1)
        for o, a in opts:
            if o == "-h":
                self.help()
                sys.exit(0)
            elif o == "-i":
                self._inputFileName = a
            elif o == "-f":
                self._inputFileFormat = a
            elif o == "-c":
                self._outputFileNameCSV = a
                self._csv = True
            elif o == "-a":
                self._outputFileNamePNG = a
            elif o == "-b":
                self._outputFileNamePerNtPNG = a

    def _listAttributeErrors(self):
        """Return the list of error messages for the current attributes (empty when valid)."""
        lMsg = []
        # "or", not "and": a non-empty name that does not exist on disk must
        # be reported as well.
        if self._inputFileName == "" or not os.path.exists(self._inputFileName):
            lMsg.append("ERROR: This input file doesn't exist!")
        if self._inputFileFormat == "":
            lMsg.append("ERROR: No input file format specified in option!")
        if self._outputFileNamePNG == "":
            lMsg.append("ERROR: No output file.png specified in option!")
        if self._outputFileNamePerNtPNG == "":
            lMsg.append("ERROR: No output filePerNt.png specified in option!")
        if self._csv and self._outputFileNameCSV == "":
            lMsg.append("ERROR: No output file.csv specified in option!")
        return lMsg

    def checkAttributes(self):
        """Validate the attributes.

        Raises a CheckerException carrying every error message (see
        getMessages()) when at least one attribute is invalid.
        """
        lMsg = self._listAttributeErrors()
        print(">>> lMsg " + str(lMsg))
        if lMsg:
            exp = CheckerException()
            exp.setMessages(lMsg)
            raise exp

    def _cleanWorkingDir(self, cDir):
        """Delete the tmpData* and tmpScript* files found directly in *cDir*."""
        import glob

        for pattern in ("tmpData*", "tmpScript*"):
            for path in glob.glob(os.path.join(cDir, pattern)):
                # Like "rm" without -r, leave directories alone.
                if os.path.isfile(path):
                    os.remove(path)

    def wrapp(self):
        """Check the options, run getLetterDistribution.py and move its outputs.

        The temporary files of the working directory are removed even when
        the run or one of the moves fails.
        """
        import shutil
        import subprocess

        self.checkAttributes()
        cDir = os.getcwd()
        outputPrefix = os.path.join(cDir, self._outputFileName)

        # Argument list instead of a shell string: file names containing
        # spaces or shell metacharacters are passed through unchanged.
        command = [
            "python",
            "%s/Java/Python/getLetterDistribution.py" % SMART_PATH,
            "-i", self._inputFileName,
            "-f", self._inputFileFormat,
            "-o", outputPrefix,
        ]
        if self._csv:
            command.append("-c")

        try:
            # The script used to be launched only when -c was given, so a run
            # without -c had no PNG file to move.
            # NOTE(review): assumes getLetterDistribution.py writes both PNG
            # files when called without -c -- confirm against that script.
            subprocess.call(command)
            if self._csv:
                shutil.move("%s.csv" % outputPrefix, self._outputFileNameCSV)
            shutil.move("%s.png" % outputPrefix, self._outputFileNamePNG)
            shutil.move("%sPerNt.png" % outputPrefix, self._outputFileNamePerNtPNG)
        finally:
            self._cleanWorkingDir(cDir)
+ + + + + + + + + + + + + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 __init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/LoggerFactory.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/LoggerFactory.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,139 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +## @mainpage Documentation of the REPET API +# +# Welcome to the API documentation! +# This API is a set of packages and classes for pipeline(s) development. +# +# @par The "logger" package +# +# Logging is managed via LoggerFactory. This class creates instances of the Python logging.Logger class. It's strongly encouraged to use this factory each time you need to log something. +# +# @par The "checker" package +# +# This package is a set of classes designed to facilitate development of different kinds of checks: filesystem checks, environment checks, configuration file checks ... +# +# Classes should subclass checker::IChecker or if a logger is needed: checker::AbstractChecker. +# +# Methods should raise checker::CheckerException. +# +# Use checker::ConfigChecker and checker::ConfigException for configuration file checks. +# +# checker::CheckerUtils is a set of small static methods shared by other classes of checker package. +# +# @par The "coord" package +# +# This package is a set of classes dedicated to coordinate manipulation. +# +# A coord::Range instance records a region on a given sequence (start, end and sequence name). +# +# A coord::Map instance is a coord::Range instance and records a named region on a given sequence (start, end, sequence name and name). +# +# A coord::Set instance is a coord::Map instance and records a named region on a given sequence with an identifier (start, end, sequence name, name and id). +# +# A coord::Align instance handles a match between two sequences, query and subject (pair of coordinates with E-value, score and identity). +# +# A coord::Path instance is a coord::Align instance and handles a match between two sequences, query and subject (pair of coordinates with E-value, score and identity) with an identifier. +# +# A coord::Match instance is a coord::Path instance and handles a chain of match(es) between two sequences, query and subject, with an identifier and the length of the input sequences.
+# +# coord::Align, coord::Map, coord::Path and coord::Set come with utils classes: coord::AlignUtils, coord::MapUtils, coord::PathUtils and coord::SetUtils. +# +# @par The "seq" package +# +# This package is a set of classes dedicated to sequence manipulation. +# +# A seq::Bioseq instance records a sequence with its header. seq::Bioseq comes with a utils class: seq::BioseqUtils. +# +# A seq::BioseqDB instance handles a collection of Bioseq instances (header-sequence). +# +# A seq::AlignedBioseqDB instance is a multiple sequence alignment representation. +# +# A seq::FastaUtils is a set of static methods for fasta file manipulation. +# +# @par The "sql" package +# +# This package is dedicated to persistence of coord package objects. +# All classes come with dedicated interfaces. Use these interfaces for class manipulation. +# Class name patterns are ITable*Adaptator and Table*Adaptator. +# +# sql::ITablePathAdaptator, sql::TablePathAdaptator / +# sql::ITableSetAdaptator, sql::TableSetAdaptator / +# sql::ITableSeqAdaptator, sql::TableSeqAdaptator / +# sql::ITableMapAdaptator, sql::TableMapAdaptator / +# sql::ITableMatchAdaptator, sql::TableMatchAdaptator. +# + +import logging +import sys + +DEFAULT_LEVEL = 1 +DEFAULT_FORMAT = "%(asctime)s - %(module)s - %(levelname)s - %(message)s" +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" + +## Use this class to create an instance of the logging.Logger class.
#
class LoggerFactory(object):

    ## Return the logger registered under a name, adding a stream handler when it has none
    #
    # @param name name given to logging.getLogger()
    # @param verbosity verbosity value forwarded to setLevel()
    # @param format record format given to the logging.Formatter of a newly added handler
    # @param out stream given to a newly added logging.StreamHandler
    # @return the logger obtained from logging.getLogger()
    #
    @staticmethod
    def createLogger(name, verbosity = DEFAULT_LEVEL, format = DEFAULT_FORMAT, out = sys.stdout):
        logger = logging.getLogger(name)

        # Exact class comparison: a handler whose class merely derives from
        # StreamHandler is not counted as a stream handler here.
        alreadyStreaming = any(existing.__class__ == logging.StreamHandler for existing in logger.handlers)
        if not alreadyStreaming:
            recordFormatter = logging.Formatter(format, DATE_FORMAT)
            streamHandler = logging.StreamHandler(out)
            streamHandler.setFormatter(recordFormatter)
            logger.addHandler(streamHandler)

        LoggerFactory.setLevel(logger, verbosity)
        return logger

    ## Translate a verbosity value into a logging level, or disable the logger
    #
    # 4 and above selects DEBUG, 3 INFO, 2 WARNING and 1 ERROR; 0 disables the logger.
    # Any other value only re-enables the logger and leaves its level unchanged.
    #
    # @param log logger to configure
    # @param verbosity verbosity value
    #
    @staticmethod
    def setLevel(log, verbosity):
        log.disabled = False
        if verbosity >= 4:
            log.setLevel(logging.DEBUG)
            return
        exactLevels = ((3, logging.INFO), (2, logging.WARNING), (1, logging.ERROR))
        for threshold, level in exactLevels:
            if verbosity == threshold:
                log.setLevel(level)
                return
        if verbosity == 0:
            log.disabled = True
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.checker.IChecker import IChecker +from commons.core.LoggerFactory import LoggerFactory + + +## Enable a Logger in your Checker. +# +# Subclasses of AbstractChecker already have a logger enabled (referenced by the self._log attribute). Subclasses also already implement IChecker. +# All you have to do is to call the __init__() method of AbstractChecker in your own constructor.
class AbstractChecker(IChecker):

    ## Constructor
    #
    # @param logFileName value handed to LoggerFactory.createLogger() to obtain the logger
    #
    def __init__(self, logFileName):
        logger = LoggerFactory.createLogger(logFileName)
        self._log = logger


    ## Replace the logger held by the checker
    #
    # @param logger the logger to use from now on
    #
    def setLogger(self, logger):
        self._log = logger


    ## Give access to the logger held by the checker
    #
    # @return the object stored in self._log
    #
    def getLogger(self):
        return self._log
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


## Exception raised during check
#
# Thin wrapper around Exception that additionally carries a list of messages
# (see the messages attribute) next to the single message given at construction.
#
class CheckerException( Exception ):

    ## Constructor
    #
    # @param msg message embedded in Exception class (also kept in self.msg)
    def __init__(self,msg=""):
        super(CheckerException, self).__init__(msg)
        self.msg = msg
        # starts empty; filled through setMessages()
        self.messages = []


    ## Store the list of messages attached to this exception
    #
    # @param lMessages list of messages; the list object itself is kept, not a copy
    def setMessages(self,lMessages):
        self.messages = lMessages


    ## Return the list of messages attached to this exception
    #
    def getMessages(self):
        return self.messages


# ---- patch header and license of the next file in the changeset (kept as comments) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/checker/CheckerUtils.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/checker/CheckerUtils.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,316 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import os +import sys +import re +import glob +import ConfigParser +from ConfigParser import NoOptionError +from ConfigParser import NoSectionError +from commons.core.checker.CheckerException import CheckerException + + +## A set of static methods used to perform checks. 
#
#
class CheckerUtils( object ):

    ## Check if blastName param is NOT in ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
    #
    # @param blastName name to check
    # @return True if name is NOT one of the listed programs, False if it is
    #
    @staticmethod
    def isBlastNameNotInBlastValues( blastName ):
        return blastName not in ( "blastn", "blastp", "blastx", "tblastn", "tblastx" )


    ## Check if param is NOT "TRUE" and NOT "FALSE"
    #
    # @param param str to check
    # @return True if param is not eq to "TRUE" AND not eq to "FALSE", False otherwise
    #
    @staticmethod
    def isNotTRUEisNotFALSE( param ):
        return param not in ( "TRUE", "FALSE" )


    ## Check if resource (file or dir) does NOT exist
    #
    # @param resource file or dir to check
    # @return True if resource does NOT exist, False if it exists
    #
    @staticmethod
    def isRessourceNotExits( resource ):
        return not os.path.exists( resource )


    ## Check a specific E-value format: de-dd
    #
    # @param param E-value to check
    # @return True if param does NOT start with the format de-dd, False if it does
    #
    @staticmethod
    def isNotAeValueWithOneDigit2DecimalsAtLeast( param ):
        # re.match only anchors at the start, so anything after the two
        # exponent digits is accepted ("at least" 2 digits).
        return not re.match( r"\de\-\d\d", param )


    ## Check a number format
    #
    # @param param value to check
    # @return True if param does NOT start with a digit, False otherwise
    #
    @staticmethod
    def isNotANumber( param ):
        # NOTE: this is a prefix match (re.match without end anchor): "12abc"
        # and "1.5" are both reported as numbers. Kept as is because callers
        # may rely on it.
        return not re.match( r"\d+", param )


    ## List the existing directories of the user's PATH, in PATH order
    #
    # @return list of directory names (empty if PATH is unset or empty)
    #
    @staticmethod
    def _getDirsFromUserPath():
        lDirs = os.environ.get( "PATH", "" ).split( os.pathsep )
        return [ dirPath for dirPath in lDirs if os.path.isdir( dirPath ) ]


    ## Check if an executable is in the user's PATH
    #
    # @param exeName name of the executable
    # @return True if an entry named exeName is found in a directory of the user's PATH, False otherwise
    #
    @staticmethod
    def isExecutableInUserPath( exeName ):
        for dirPath in CheckerUtils._getDirsFromUserPath():
            for entryPath in glob.glob( dirPath + "/*" ):
                if os.path.basename( entryPath ) == exeName:
                    return True
        return False


    ## Return the directory of the user's PATH which contains a given executable
    #
    # @param exeName name of the executable
    # @return the directory part only (without the executable name), or "" if not found
    #
    @staticmethod
    def getFullPathFromExecutable( exeName ):
        for dirPath in CheckerUtils._getDirsFromUserPath():
            for entryPath in glob.glob( "%s/*" % ( dirPath ) ):
                parentDir, entryName = os.path.split( entryPath )
                if entryName == exeName:
                    return parentDir
        return ""


    #TODO: to remove ?
    ## Check if a queue Name is valid. Warning: Only with the queue manager SGE
    #
    # @param fullQueueName name of the queue to test (with or without parameters)
    # @return True if queue name is valid, False otherwise (also False for an empty name)
    #
    @staticmethod
    def isQueueNameValid( fullQueueName ):
        lFields = fullQueueName.split()
        if not lFields:
            # nothing to look up: an empty name cannot match a queue
            return False
        queueName = lFields[0]
        if queueName == "none":
            return True
        if not CheckerUtils.isExecutableInUserPath( "qconf" ):
            msg = "executable 'qconf' can't be found"
            sys.stderr.write( "%s\n" % ( msg ) )
            return False
        # the list of queues is written by the shell in a file of the current directory
        queueFile = "queueName.txt"
        cmd = "qconf -sql > " + queueFile
        os.system( cmd )
        try:
            with open( queueFile, "r" ) as queueFileHandler:
                lQueueNames = [ qName.strip() for qName in queueFileHandler ]
        finally:
            # always clean the temporary file, even if reading it failed
            if os.path.exists( queueFile ):
                os.remove( queueFile )
        return queueName in lQueueNames


    ## Check if a string length is lower or equal than 15
    #
    # @param strName any string
    # @return True if string length is <= 15, False otherwise
    #
    @staticmethod
    def isMax15Char( strName ):
        return (len(strName) <= 15 )


    ## Check if a string is made with only alphanumeric or underscore character
    #
    # @param strName any string
    # @return True if string is with alphanumeric or underscore, False otherwise
    #
    @staticmethod
    def isCharAlphanumOrUnderscore( strName ):
        # authorized ALPHABET [a-z,A-Z,0-9,_]: any \W character makes the name invalid
        return re.search( r'\W', strName ) is None


    ## Check if sectionName is in the configuration file
    #
    # @param config ConfigParser instance of configuration file
    # @param sectionName string of section name to check
    # @exception NoSectionError: if section not found raise a NoSectionError
    #
    @staticmethod
    def checkSectionInConfigFile( config, sectionName ):
        if not (config.has_section(sectionName)):
            raise NoSectionError(sectionName)


    ## Check if an option is in a specified section in the configuration file
    #
    # @param config ConfigParser instance of configuration file
    # @param sectionName string of section name
    # @param optionName string of option name to check
    # @exception NoOptionError: if option not found raise a NoOptionError
    #
    @staticmethod
    def checkOptionInSectionInConfigFile( config, sectionName, optionName ):
        # the value is discarded: config.get() is only called for the exception it raises
        config.get( sectionName, optionName )


    ## Read a CHANGELOG handle up to the first "REPET release " line and return its version
    #
    # @param changeLogFileHandle CHANGELOG file handle
    # @return third whitespace-separated field of the release line
    # @exception CheckerException if the end of file is reached without a release line
    #
    @staticmethod
    def _readVersionFromChangeLog( changeLogFileHandle ):
        while True:
            line = changeLogFileHandle.readline()
            if not line:
                # readline() returns an empty string forever at EOF: stop instead of looping
                raise CheckerException( "*** Error: no 'REPET release' line found in CHANGELOG" )
            if line.startswith( "REPET release " ):
                return line.split()[2]


    ## Check version number coherency between configFile and CHANGELOG
    #
    # @param changeLogFileHandle CHANGELOG file handle
    # @param config ConfigParser Instance of configuration file
    # @exception NoOptionError: if option not found raise a NoOptionError
    # @exception CheckerException: if versions differ or no release line is found in CHANGELOG
    #
    @staticmethod
    def checkConfigVersion( changeLogFileHandle, config ):
        numVersionChangeLog = CheckerUtils._readVersionFromChangeLog( changeLogFileHandle )
        numVersionConfig = config.get("repet_env", "repet_version")

        if numVersionChangeLog != numVersionConfig:
            message = "*** Error: wrong config file version. Expected version num is " + numVersionChangeLog + " but actual in config file is " + numVersionConfig
            raise CheckerException(message)


    ## Get version number from CHANGELOG
    #
    # @param changeLogFileName CHANGELOG file name
    # @return version number found on the first "REPET release " line
    # @exception CheckerException: if no release line is found in CHANGELOG
    #
    @staticmethod
    def getVersionFromChangelogFile(changeLogFileName):
        with open(changeLogFileName) as changeLogFileHandle:
            return CheckerUtils._readVersionFromChangeLog( changeLogFileHandle )


    ## Check if headers of an input file contain only alpha numeric characters and "_ : . -"
    #
    # @param fileHandler file handle
    # @exception CheckerException if bad header raise a CheckerException (wrong headers are in its messages)
    #
    @staticmethod
    def checkHeaders( fileHandler ):
        lHeaders = CheckerUtils._getHeaderFromFastaFile(fileHandler)
        forbiddenChar = re.compile(r'[^a-zA-Z0-9_:\.\-]')
        lWrongHeaders = [ header for header in lHeaders if forbiddenChar.search(header) ]
        if lWrongHeaders:
            exception = CheckerException()
            exception.setMessages(lWrongHeaders)
            raise exception


    ## Collect the headers (lines starting with ">") of a fasta file
    #
    # @param inFile file handle
    # @return list of headers without the leading ">" and without the end of line
    #
    @staticmethod
    def _getHeaderFromFastaFile( inFile ):
        lHeaders = []
        while True:
            line = inFile.readline()
            if line == "":
                break
            if line[0] == ">":
                # only drop the last character when it really is the newline,
                # otherwise a header ending the file would lose a character
                if line.endswith( "\n" ):
                    lHeaders.append( line[1:-1] )
                else:
                    lHeaders.append( line[1:] )
        return lHeaders


    ## Return True if an option is in a specified section in the configuration file, False otherwise
    #
    # @param configHandler handler of configuration file
    # @param section string of section name
    # @param option string of option name to check
    # @return False as well when the section itself is missing
    #
    @staticmethod
    def isOptionInSectionInConfig( configHandler, section, option ):
        try:
            CheckerUtils.checkOptionInSectionInConfigFile( configHandler, section, option )
        except (NoOptionError, NoSectionError):
            return False
        return True


# ---- patch header and license of the next file in the changeset (kept as comments) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/checker/ConfigChecker.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/checker/ConfigChecker.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,226 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
import re
import sys
from commons.core.utils.RepetConfigParser import RepetConfigParser
from commons.core.checker.ConfigValue import ConfigValue
from commons.core.checker.IChecker import IChecker
from commons.core.checker.RepetException import RepetException
from commons.core.utils.FileUtils import FileUtils


## Description of what is expected for one section or one option
#
# mandatory: the section/option must be present
# isPattern: the section/option name is a regular expression, not a literal name
# type: expected type of the value ("int", "float", "bool"/"boolean", anything else means string)
# set: allowed values (an empty tuple means no restriction)
# help: free text describing the section/option
#
class Rule(object):

    def __init__(self, mandatory= False, isPattern=False, type="", set=(), help =""):
        self.mandatory = mandatory
        self.isPattern = isPattern
        self.type = type
        self.set = set
        self.help = help


## Set of rules of a configuration file, stored as dRules4Sections[section][option] = Rule
#
# The rule of a section itself is stored under the option name "DEFAULT".
#
class ConfigRules(object):

    def __init__(self, configName = "", configDescription = ""):
        self.configName = configName
        self.configDescription = configDescription
        self.dRules4Sections={}

    ## Store a rule for (section, option); type is stored lower-cased
    #
    def _addRule(self, section, option="DEFAULT", mandatory=False, isPattern=False, type="", set=(), help =""):
        dRules4Options = self.dRules4Sections.setdefault(section, {})
        # help is forwarded too, otherwise it is lost as soon as the rule is created
        dRules4Options[option] = Rule(mandatory, isPattern, type.lower(), set, help)

    ## Add a rule on a section (stored under the "DEFAULT" option of this section)
    #
    def addRuleSection(self, section, mandatory=False, isPattern=False, help = ""):
        self._addRule(section = section, option = "DEFAULT", mandatory = mandatory, isPattern = isPattern, help = help)

    ## Add a rule on an option of a section
    #
    def addRuleOption(self, section, option, mandatory=False, isPattern=False, type="", set=(), help = ""):
        self._addRule(section = section, option = option, mandatory = mandatory, isPattern = isPattern, type = type, set=set , help = help)

    ## @return the mandatory flag of the section rule, False if the section has no rule
    #
    def isSectionMandatory(self, section):
        return self.isOptionMandatory(section, "DEFAULT")

    ## @return the mandatory flag of the option rule, False if there is no rule for it
    #
    def isOptionMandatory(self, section, option):
        rule = self.getRule(section, option)
        if rule is None:
            return False
        return rule.mandatory

    ## @return the Rule stored for (section, option), None if there is none
    #
    def getRule(self, section, option):
        return self.dRules4Sections.get(section, {}).get(option)


## Read a configuration file and check it against a ConfigRules instance
#
# getConfig() is the entry point: it runs every step and returns a ConfigValue.
#
class ConfigChecker(IChecker):

    ## Constructor
    #
    # @param cfgFileName name of the configuration file
    # @param iCfgRules ConfigRules instance, may contain pattern rules
    #
    def __init__ (self, cfgFileName, iCfgRules):
        self._configFileName = cfgFileName
        self._iConfigRules = iCfgRules
        self._iRawConfig = ConfigValue()
        # rules with patterns replaced by the names really found in the file
        self._iExtendedConfigRules = ConfigRules()

    ## Parse the configuration file
    #
    # @return a RepetConfigParser instance loaded with the file content
    # @exception RepetException if anything goes wrong while opening or parsing
    #
    def readConfigFile(self):
        iConfig = RepetConfigParser()
        try:
            configFile = open(self._configFileName)
            try:
                iConfig.readfp(configFile)
            finally:
                # close the handle whatever the parser does
                configFile.close()
            return iConfig
# TODO USE OF CONFIG ERROR
#            if DuplicateSectionError:
#                raise Exception ("Duplicate section exist in config file %s" %(self._configFileName ))
        except Exception:
            # KeyboardInterrupt / SystemExit are deliberately not converted
            raise RepetException ("Unexpected error: %s" % sys.exc_info()[0])

    ## Copy every option of iConfig, as read (no type conversion), into the raw configuration
    #
    def setRawConfig(self, iConfig ):
        for sectionName in iConfig.sections():
            for optionName in iConfig.options(sectionName):
                optionValue = iConfig.get(sectionName, optionName)
                self._iRawConfig.set(sectionName, optionName, optionValue)

    ## Read one option, converted according to the type of its rule
    #
    # @return the converted value, or the plain value when there is no rule for the option
    # @exception RepetException if the rule has a set of allowed values and the value is not in it
    #
    def getOptionValueAccordingRule(self, iConfig, sectionName, optionName):
        optionRule = self._iExtendedConfigRules.getRule(sectionName, optionName)
        if optionRule is None:
            return iConfig.get(sectionName, optionName)

        if optionRule.type == "int":
            optionValue = iConfig.getint(sectionName, optionName)
        elif optionRule.type == "float":
            optionValue = iConfig.getfloat(sectionName, optionName)
        elif optionRule.type in ("bool", "boolean"):
            optionValue = iConfig.getboolean(sectionName, optionName)
        else:
            optionValue = iConfig.get(sectionName, optionName)
        # NOTE(review): the check is applied whatever the type; nesting inferred, confirm against upstream
        if optionRule.set!=() and not(optionValue in optionRule.set):
            raise RepetException ("value must be in %s " % repr(optionRule.set))

        return optionValue

    ## Build the typed configuration; it replaces the raw one only if no value is rejected
    #
    # @exception RepetException listing every rejected value
    #
    def setConfig(self, iConfig ):
        config = ConfigValue()
        valueErr = ""
        for sectionName in iConfig.sections():
            for optionName in iConfig.options(sectionName):
                try:
                    optionValue = self.getOptionValueAccordingRule(iConfig, sectionName, optionName )
                    config.set(sectionName, optionName, optionValue)
                except RepetException:
                    # errors are accumulated so that all of them are reported at once
                    error = sys.exc_info()[1]
                    valueErr += "\n - [%s]: %s %s" % (sectionName, optionName, error.getMessage())
        if valueErr == "":
            self._iRawConfig = config
        else:
            raise RepetException ("Following errors occurs:%s\n" %valueErr)

    ## @exception RepetException if the configuration file does not exist
    #
    def checkIfExistsConfigFile (self):
        if not (FileUtils.isRessourceExists(self._configFileName)):
            raise RepetException("CONFIG FILE not found - '%s'" % self._configFileName)

    ## @exception RepetException listing the mandatory sections absent from the raw configuration
    #
    def checkMandatorySections (self):
        missingSection = ""
        for sectionName in self._iExtendedConfigRules.dRules4Sections.keys():
            if self._iExtendedConfigRules.isSectionMandatory(sectionName) and not self._iRawConfig.has_section(sectionName):
                missingSection += "\n - %s" %(sectionName)
        if missingSection != "":
            raise RepetException ("Error in configuration file %s, following sections are missing:%s\n" % (self._configFileName, missingSection))

    ## @exception RepetException listing the mandatory options absent from the raw configuration
    #
    def checkMandatoryOptions (self):
        missingOption = ""
        for sectionName in self._iExtendedConfigRules.dRules4Sections.keys():
            if self._iExtendedConfigRules.isSectionMandatory(sectionName) or self._iRawConfig.has_section(sectionName) :
                dRules4OptionsOfThisSection = self._iExtendedConfigRules.dRules4Sections[sectionName]
                for optionName in dRules4OptionsOfThisSection.keys():
                    if optionName != "DEFAULT" and self._iExtendedConfigRules.isOptionMandatory(sectionName, optionName) and not self._iRawConfig.has_option(sectionName, optionName):
                        missingOption += "\n - [%s]: %s" % (sectionName, optionName)
        if missingOption != "":
            raise RepetException ("Error in configuration file %s, following options are missing: %s\n" % (self._configFileName, missingOption))

    ## @return names of the raw sections equal to the word, or matching the pattern (case-insensitive search)
    #
    def getSectionNamesAccordingPatternRules (self, sectionWordOrPattern, isPattern):
        lSectionsFoundAccordingPatternRules=[]
        if isPattern == False:
            if self._iRawConfig.has_section(sectionWordOrPattern):
                lSectionsFoundAccordingPatternRules.append(sectionWordOrPattern)
        else:
            for sectionName in self._iRawConfig.sections():
                if re.search(sectionWordOrPattern, sectionName, re.IGNORECASE):
                    lSectionsFoundAccordingPatternRules.append(sectionName)
        return lSectionsFoundAccordingPatternRules

    ## @return names of the raw options of a section equal to the word, or matching the pattern (case-insensitive search)
    #
    def getOptionsNamesAccordingPatternRules(self, sectionName, optionWordOrPattern, isPattern):
        lOptionsFoundAccordingPatternRules=[]
        if isPattern == False:
            if self._iRawConfig.has_option(sectionName, optionWordOrPattern):
                lOptionsFoundAccordingPatternRules.append(optionWordOrPattern)
        else :
            for optionName in self._iRawConfig.options(sectionName):
                if re.search(optionWordOrPattern, optionName, re.IGNORECASE) is not None:
                    lOptionsFoundAccordingPatternRules.append(optionName)
        return lOptionsFoundAccordingPatternRules

    ## Fill the extended rules: every (pattern) rule is applied to the names found in the raw configuration
    #
    # A mandatory section/option found nowhere is kept under its own name so that
    # the mandatory checks can report it.
    #
    def extendConfigRulesWithPatternRules(self):
        for sectionName in self._iConfigRules.dRules4Sections.keys():
            dRules4OptionsOfThisSection = self._iConfigRules.dRules4Sections[sectionName]
            lRawSections=[]
            # NOTE(review): the else below is paired with this test; nesting inferred, confirm against upstream
            if "DEFAULT" in dRules4OptionsOfThisSection:
                sectionRule = dRules4OptionsOfThisSection["DEFAULT"]
                mandatorySection = sectionRule.mandatory
                lRawSections=self.getSectionNamesAccordingPatternRules(sectionName, sectionRule.isPattern)
                for rawSectionName in lRawSections:
                    # second positional argument of addRuleSection is the mandatory flag
                    self._iExtendedConfigRules.addRuleSection(rawSectionName, mandatorySection, help = sectionRule.help)
                if mandatorySection and (len(lRawSections)==0):
                    self._iExtendedConfigRules.addRuleSection(sectionName, mandatorySection, help = sectionRule.help)
            else:
                lRawSections.append(sectionName)
            for optionName in dRules4OptionsOfThisSection.keys():
                if optionName == "DEFAULT":
                    continue
                optionRule = dRules4OptionsOfThisSection[optionName]
                for rawSectionName in lRawSections:
                    lRawOptions=self.getOptionsNamesAccordingPatternRules(rawSectionName, optionName, optionRule.isPattern)
                    for rawOptionName in lRawOptions:
                        self._iExtendedConfigRules.addRuleOption(rawSectionName, rawOptionName, optionRule.mandatory, False, optionRule.type, optionRule.set, optionRule.help)
                    if optionRule.mandatory and (len(lRawOptions)==0):
                        self._iExtendedConfigRules.addRuleOption(rawSectionName, optionName, optionRule.mandatory, False, optionRule.type, optionRule.set, optionRule.help)

    ## Run every step: existence of the file, parsing, rules extension, mandatory checks, typed values
    #
    # @return the checked configuration (ConfigValue instance)
    # @exception RepetException raised by any of the steps
    #
    def getConfig(self):
        self.checkIfExistsConfigFile()
        iConfig = self.readConfigFile()
        self.setRawConfig(iConfig)
        self.extendConfigRulesWithPatternRules()
        self.checkMandatorySections()
        self.checkMandatoryOptions()
        self.setConfig(iConfig)
        return self._iRawConfig


# ---- patch header and license of the next file in the changeset (kept as comments) ----
# \ No newline at end of file
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/checker/ConfigException.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/checker/ConfigException.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,53 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge.
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

from commons.core.checker.RepetException import RepetException

## An exception raised by check() method of class ConfigChecker
#
# This class allows storage of multiple messages (see messages attribute).
# Example: use one instance of ConfigException class for one section in configuration file.
# All messages relative to this section are stored in messages attribute.
class ConfigException( RepetException ):

    ## Constructor
    #
    # @param msg message embedded in Exception class
    # @param messages list of messages attached to the exception (a new empty list when omitted)
    #
    def __init__(self, msg, messages = None):
        RepetException.__init__(self, msg)
        # a list literal as default value would be shared by every instance built without this argument
        if messages is None:
            messages = []
        self.messages = messages

    ## @return the list of messages attached to the exception
    #
    def getMessages(self):
        return self.messages

    ## @param messages list replacing the messages attached to the exception
    #
    def setMessages(self, messages):
        self.messages = messages


# ---- patch header and license of the next file in the changeset (kept as comments) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/checker/ConfigValue.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/checker/ConfigValue.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,70 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Values of a configuration, stored as dOptionsValues4Sections[sectionName][optionName] = optionValue
#
class ConfigValue(object):

    def __init__(self):
        self.dOptionsValues4Sections={}

    ## @return True if the section is known, False otherwise
    #
    def has_section(self,sectionName):
        return sectionName in self.dOptionsValues4Sections

    ## @return True if the section is known and contains the option, False otherwise
    #
    def has_option(self, sectionName, optionName):
        return optionName in self.dOptionsValues4Sections.get(sectionName, {})

    ## @return list of the section names
    #
    def sections(self):
        # list() so that callers get a real list whatever the interpreter version
        return list(self.dOptionsValues4Sections)

    ## @return list of the option names of a section (empty list for an unknown section)
    #
    def options(self, sectionName):
        return list(self.dOptionsValues4Sections.get(sectionName, {}))

    ## @return the value of an option, None if the section or the option is unknown
    #
    def get(self, sectionName, optionName):
        return self.dOptionsValues4Sections.get(sectionName, {}).get(optionName)

    ## Set the value of an option; the section is created when needed
    #
    def set(self, sectionName, optionName, optionValue):
        self.dOptionsValues4Sections.setdefault(sectionName, {})[optionName] = optionValue

    ## Replace the whole content (the given dictionary is kept, not copied)
    #
    def setdOptionsValues4Sections(self, dOptionsValues4Sections):
        self.dOptionsValues4Sections = dOptionsValues4Sections

    ## Two ConfigValue are equal when they hold the same sections, options and values
    #
    def __eq__(self, o):
        try:
            dOther = o.dOptionsValues4Sections
        except AttributeError:
            # not comparable: let Python try the reflected comparison instead of crashing
            return NotImplemented
        return self.dOptionsValues4Sections == dOther

    ## Kept consistent with __eq__ (Python 2 does not derive != from ==)
    #
    def __ne__(self, o):
        isEqual = self.__eq__(o)
        if isEqual is NotImplemented:
            return isEqual
        return not isEqual


# ---- patch header and license of the next file in the changeset (kept as comments) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/checker/IChecker.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/checker/IChecker.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,45 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


## Interface for a checker
#
# Python has no interface keyword: this class plays that role for checkers.
#
# All checkers are subclasses of IChecker.
#
class IChecker( object ):

    ## Perform the check; implementations raise a CheckerException if an error occurred
    #
    # The version defined here does nothing and returns None.
    #
    # @param arg a collecting parameter: put here all you need to perform check
    #
    def check(self, arg=""):
        return None


# ---- patch header and license of the next file in the changeset (kept as comments) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/checker/OldConfigChecker.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/checker/OldConfigChecker.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,101 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
# You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import ConfigParser
from ConfigParser import NoOptionError
from commons.core.checker.IChecker import IChecker
from commons.core.checker.ConfigException import ConfigException


## A checker for a configuration file
#
#
# A configuration file is formatted as follows:
#
# [section1]
#
# option_name1: option_value1
#
# option_name2: option_value2
#
# option_name3: option_value3
#
# [section2]
#
# ...
#
#
# This class performs 3 checks on a configuration file:
#
# (i) check if file exists
#
# (ii) check if section exists
#
# (iii) check if option exists
#
class ConfigChecker( IChecker ):

    ## Constructor A checker for configuration file.
    #
    # @param sectionName name of section to check in configuration file
    # @param optionsDict dictionary with option(s) to check as keys and empty strings ("") as values
    def __init__ (self, sectionName, optionsDict):
        self._sectionName = sectionName
        self._optionsDict = optionsDict


    ## Perform 3 checks : file exists, sections exists, option exists
    #
    # The value of every option found is written back into the dictionary given to the constructor.
    #
    # @param configFile configuration file to check
    # @exception ConfigException with a list of messages
    def check (self, configFile):
        config = ConfigParser.ConfigParser()
        msg = []
        try:
            # the with statement closes the handle, even if parsing fails
            with open(configFile) as configHandle:
                config.readfp( configHandle )
        except IOError as e:
            # str(e) and not e.message: message is empty for an IOError built from (errno, strerror)
            msg.append("CONFIG FILE not found - " + str(e))
            raise ConfigException("", msg)

        if not (config.has_section(self._sectionName)):
            msg.append("[" + self._sectionName + "]" + " section not found - ")
            raise ConfigException("", msg)

        # every missing option is collected before raising, so that all of them are reported at once
        for key in self._optionsDict.keys():
            try:
                self._optionsDict[key] = config.get(self._sectionName, key)
            except NoOptionError as e:
                msg.append("[" + self._sectionName + "]" + " - " + str(e))

        if msg:
            raise ConfigException("", msg)


# ---- patch header and license of the next file in the changeset (kept as comments) ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/checker/RepetException.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/checker/RepetException.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,51 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +class RepetException(Exception): + + def __init__(self, msg): + Exception.__init__(self) + self._message = msg + + def __str__(self): + return self._message + + def getMessage(self): + return self._message + + def setMessage(self, msg): + self._message = msg + + +class RepetDataException(RepetException): + + def __init__(self, msg): + RepetException.__init__(self, msg) \ No newline at end of file diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/checker/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/Align.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Align.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,428 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import time + +from commons.core.coord.Range import Range +from commons.core.coord.Map import Map + + +## Handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity) +# +class Align( object ): + + ## Constructor + # + # @param range_q: a Range instance for the query + # @param range_s: a Range instance for the subject + # @param e_value: E-value of the match + # @param identity: identity percentage of the match + # @param score: score of the match + # + def __init__(self, range_q=Range(), range_s=Range(), e_value=0, score=0, identity=0): + self.range_query = range_q + self.range_subject = range_s + self.e_value = float(e_value) + self.score = float(score) + self.identity = float(identity) + + ## Return True if the instance is empty, False otherwise + # + def isEmpty(self): + return self.range_query.isEmpty() or self.range_subject.isEmpty() + + ## Equal operator + # + def __eq__(self, o): + if self.range_query==o.range_query and self.range_subject==o.range_subject and \ + self.e_value==o.e_value and self.score==o.score and self.identity==o.identity: + return True + return False + + ## Unequal operator + # + # @param o a Range instance + # + def __ne__(self, o): + return not self.__eq__(o) + + ## Convert the object into a string + # + # @note used in 'print myObject' + # + def __str__( self ): + return self.toString() + + ## Read attributes from an Align file + # + # @param fileHandler: file handler of the file being read + # @return: 1 on success, 0 at the end of the file + # + def read(self, 
fileHandler): + self.reset() + line = fileHandler.readline() + if line == "": + return 0 + tokens = line.split("\t") + if len(tokens) < len(self.__dict__.keys()): + return 0 + self.setFromTuple(tokens) + return 1 + + ## Set attributes from tuple + # + # @param tuple a tuple with (queryName,queryStart,queryEnd,subjectName,subjectStar,subjectEnd,E-value,score,identity) + # @note data are loaded such that the query is always on the direct strand + # + def setFromTuple( self, tuple ): + #TODO: we need to create Range instances because of __eq__() and isEmpty() tests, but WHY ??? + self.range_query = Range() + self.range_subject = Range() + if int(tuple[1]) < int(tuple[2]): + self.range_query.setFromTuple( ( tuple[0], tuple[1], tuple[2] ) ) + self.range_subject.setFromTuple( ( tuple[3], tuple[4], tuple[5] ) ) + else: + self.range_query.setFromTuple( ( tuple[0], tuple[2], tuple[1] ) ) + self.range_subject.setFromTuple( ( tuple[3], tuple[5], tuple[4] ) ) + self.e_value = float(tuple[6]) + self.score = float(tuple[7]) + self.identity = float(tuple[8]) + + ## Reset + # + def reset( self ): + self.range_query.reset() + self.range_subject.reset() + self.e_value = 0 + self.score = 0 + self.identity = 0 + + ## Return the attributes as a formatted string + # + def toString(self): + string = "%s" % ( self.range_query.toString() ) + string += "\t%s" % ( self.range_subject.toString() ) + string += "\t%g\t%i\t%f" % ( self.e_value, self.score, self.identity ) + return string + + + ## Return the attributes as a GFF-formatted string + # + def toStringAsGff( self, source="REPET", type="match", phase=".", ID="", Parent="" ): + if not self.isSubjectOnDirectStrand(): + self.reverse() + string = "%s" % ( self.getQueryName() ) + string += "\t%s" % ( source ) + string += "\t%s" % ( type ) + string += "\t%s" % ( self.getQueryMin() ) + string += "\t%s" % ( self.getQueryMax() ) + string += "\t%g" % ( self.e_value ) + string += "\t%s" % ( self.getQueryStrand() ) + string += "\t%s" % ( phase ) + 
attributes = "" + if ID != "": + attributes += "ID=%s" % ( ID ) + else: + attributes += "ID=%i" % ( str(time.time())[-8:-1].replace(".","") ) + if Parent != "": + attributes += ";Parent=%s" % ( Parent ) + attributes += ";Target=%s %i %i" % ( self.getSubjectName(), self.getSubjectStart(), self.getSubjectEnd() ) + string += "\t%s" % ( attributes ) + return string + + + ## Reverse query and subject + # + def reverse(self): + self.range_query.reverse() + self.range_subject.reverse() + + ## Show the attributes + # + def show(self): + print self.toString() + + ## Write attributes into an Align file + # + # @param fileHandler: file handler of the file being filled + # + def write(self, fileHandler): + fileHandler.write("%s\n" % (self.toString())) + + ## Save attributes into an Align file + # + # @param file: name of the file being filled + # + def save(self, file): + fileHandler = open( file, "a" ) + self.write( fileHandler ) + fileHandler.close() + + ## Return the score + # + def getScore(self): + return self.score + + ## Return the identity + # + def getIdentity(self): + return self.identity + + def getEvalue(self): + return self.e_value + + ## Return the length on the query + # + def getLengthOnQuery(self): + return self.range_query.getLength() + + ## Return the name of the query + # + def getQueryName( self ): + return self.range_query.seqname + + ## Return the start of the query + # + def getQueryStart( self ): + return self.range_query.start + + ## Return the end of the query + # + def getQueryEnd( self ): + return self.range_query.end + + ## Return the min of the query + # + def getQueryMin( self ): + return self.range_query.getMin() + + ## Return the max of the query + # + def getQueryMax( self ): + return self.range_query.getMax() + + ## Return the strand of the query + # + def getQueryStrand( self ): + return self.range_query.getStrand() + + ## Return the length on the subject + # + def getLengthOnSubject(self): + return self.range_subject.getLength() + + ## 
Return the name of the subject + # + def getSubjectName( self ): + return self.range_subject.seqname + + ## Return the start of the subject + # + def getSubjectStart( self ): + return self.range_subject.start + + ## Return the end of the subject + # + def getSubjectEnd( self ): + return self.range_subject.end + + ## Return the min of the subject + # + def getSubjectMin( self ): + return self.range_subject.getMin() + + ## Return the max of the subject + # + def getSubjectMax( self ): + return self.range_subject.getMax() + + ## Return the strand of the subject + # + def getSubjectStrand( self ): + return self.range_subject.getStrand() + + ## Return the query as a Range instance + # + def getQueryAsRange( self ): + return self.range_query + + ## Return the subject as a Range instance + # + def getSubjectAsRange( self ): + return self.range_subject + + ## Set the name of the query + # + def setQueryName( self, name ): + self.range_query.seqname = name + + ## Set the start of the query + # + def setQueryStart( self, start ): + self.range_query.start = start + + ## Set the end of the query + # + def setQueryEnd( self, end ): + self.range_query.end = end + + ## Set the name of the subject + # + def setSubjectName( self, name ): + self.range_subject.seqname = name + + ## Set the start of the subject + # + def setSubjectStart( self, start ): + self.range_subject.start = start + + ## Set the end of the subject + # + def setSubjectEnd( self, end ): + self.range_subject.end = end + + ## Merge the instance with another Align instance + # + # @param o an Align instance + # + def merge(self, o): + if self.range_query.seqname != o.range_query.seqname \ + or self.range_subject.seqname != o.range_subject.seqname: + return + self.range_query.merge(o.range_query) + self.range_subject.merge(o.range_subject) + self.score = max(self.score,o.score) + self.e_value = min(self.e_value,o.e_value) + self.identity = max(self.identity,o.identity) + + ## Return a Map instance with the subject 
mapped on the query + # + def getSubjectAsMapOfQuery(self): + iMap = Map() + iMap.name = self.range_subject.seqname + iMap.seqname = self.range_query.seqname + if self.range_subject.isOnDirectStrand(): + iMap.start = self.range_query.start + iMap.end = self.range_query.end + else: + iMap.start = self.range_query.end + iMap.end = self.range_query.start + return iMap + + ## Return True if query is on direct strand + # + def isQueryOnDirectStrand( self ): + return self.range_query.isOnDirectStrand() + + ## Return True if subject is on direct strand + # + def isSubjectOnDirectStrand( self ): + return self.range_subject.isOnDirectStrand() + + ## Return True if query and subject are on the same strand, False otherwise + # + def areQrySbjOnSameStrand(self): + return self.isQueryOnDirectStrand() == self.isSubjectOnDirectStrand() + + ## Return False if query and subject are on the same strand, True otherwise + # + def areQrySbjOnOppositeStrands(self): + return not self.areQrySbjOnSameStrand() + + ## Set attributes from string + # + # @param string a string formatted like queryName queryStart queryEnd subjectName subjectStart subjectEnd E-value score identity + # @param sep field separator + # + def setFromString(self, string, sep="\t"): + if string[-1] == "\n": + string = string[:-1] + self.setFromTuple( string.split(sep) ) + + ## Return a first Map instance for the query and a second for the subject + # + def getMapsOfQueryAndSubject(self): + iMapQuery = Map( name="repet", + seqname=self.range_query.seqname, + start=self.range_query.start, + end=self.range_query.end ) + iMapSubject = Map( name="repet", + seqname=self.range_subject.seqname, + start=self.range_subject.start, + end=self.range_subject.end ) + return iMapQuery, iMapSubject + + ## Write query coordinates as Map in a file + # + # @param fileHandler: file handler of the file being filled + # + def writeSubjectAsMapOfQuery( self, fileHandler ): + m = self.getSubjectAsMapOfQuery() + m.write( fileHandler ) + + ## 
Return a bin for fast database access + # + def getBin(self): + return self.range_query.getBin() + + ## Switch query and subject + # + def switchQuerySubject( self ): + tmpRange = self.range_query + self.range_query = self.range_subject + self.range_subject = tmpRange + if not self.isQueryOnDirectStrand(): + self.reverse() + + ## Return True if the query overlaps with the query of another Align instance, False otherwise + # + def isQueryOverlapping( self, iAlign ): + return self.getQueryAsRange().isOverlapping( iAlign.getQueryAsRange() ) + + ## Return True if the subject overlaps with the subject of another Align instance, False otherwise + # + def isSubjectOverlapping( self, iAlign ): + return self.getSubjectAsRange().isOverlapping( iAlign.getSubjectAsRange() ) + + ## Return True if the Align instance overlaps with another Align instance, False otherwise + # + def isOverlapping( self, iAlign ): + if self.isQueryOverlapping( iAlign ) and self.isSubjectOverlapping( iAlign ): + return True + else: + return False + + ## Update the score + # + # @note the new score is the length on the query times the percentage of identity + # + def updateScore( self ): + newScore = self.getLengthOnQuery() * self.getIdentity() / 100.0 + self.score = newScore diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/AlignUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/AlignUtils.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,359 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +import os +import sys +import shutil +from commons.core.coord.Align import Align + + +## Static methods manipulating Align instances +# +class AlignUtils( object ): + + ## Return a list with Align instances from the given file + # + # @param inFile name of a file in the Align format + # + def getAlignListFromFile( inFile ): + lAlignInstances = [] + inFileHandler = open( inFile, "r" ) + while True: + line = inFileHandler.readline() + if line == "": + break + a = Align() + a.setFromString( line ) + lAlignInstances.append( a ) + inFileHandler.close() + return lAlignInstances + + getAlignListFromFile = staticmethod( getAlignListFromFile ) + + + ## Return a list with all the scores + # + # @param lAlignInstances: list of Align instances + # + def getListOfScores( lAlignInstances ): + lScores = [] + for iAlign in lAlignInstances: + lScores.append( iAlign.score ) + return lScores + + getListOfScores = staticmethod( getListOfScores ) + + + ## Return a list with all the scores from the given file + # + # @param inFile name of a file in the Align format + # + def getScoreListFromFile(inFile): + lScores = [] + append = lScores.append + with open(inFile, "r") as inFileHandler: + line = inFileHandler.readline() + while line: + if line != "\n": + append(int(line.split('\t')[7])) + line = inFileHandler.readline() + return lScores + + getScoreListFromFile = staticmethod( getScoreListFromFile ) + + + ## for each line of a given Align file, write the coordinates on the query and the subject as two distinct lines in a Map file + # + # @param alignFile: name of the input Align file + # @param mapFile: name of the output Map file + # + def convertAlignFileIntoMapFileWithQueriesAndSubjects( alignFile, mapFile ): + alignFileHandler = open( alignFile, "r" ) + mapFileHandler = open( mapFile, "w" ) + iAlign = Align() + while True: + line = alignFileHandler.readline() + if line == "": + break + iAlign.setFromString( line ) + iMapQ, iMapS = iAlign.getMapsOfQueryAndSubject() + 
iMapQ.write( mapFileHandler ) + iMapS.write( mapFileHandler ) + alignFileHandler.close() + mapFileHandler.close() + + convertAlignFileIntoMapFileWithQueriesAndSubjects = staticmethod( convertAlignFileIntoMapFileWithQueriesAndSubjects ) + + + ## for each line of a given Align file, write the coordinates of the subject on the query as one line in a Map file + # + # @param alignFile: name of the input Align file + # @param mapFile: name of the output Map file + # + def convertAlignFileIntoMapFileWithSubjectsOnQueries( alignFile, mapFile ): + alignFileHandler = open( alignFile, "r" ) + mapFileHandler = open( mapFile, "w" ) + iAlign = Align() + while True: + line = alignFileHandler.readline() + if line == "": + break + iAlign.setFromString( line ) + iMapQ = iAlign.getSubjectAsMapOfQuery() + iMapQ.write( mapFileHandler ) + alignFileHandler.close() + mapFileHandler.close() + + convertAlignFileIntoMapFileWithSubjectsOnQueries = staticmethod( convertAlignFileIntoMapFileWithSubjectsOnQueries ) + + + ## return a list of Align instances sorted in decreasing order according to their score, then their length on the query and finally their initial order + # + # @param lAligns: list of Align instances + # + def getAlignListSortedByDecreasingScoreThenLength( lAligns ): + return sorted( lAligns, key=lambda iAlign: ( 1 / float(iAlign.getScore()), 1 / float(iAlign.getLengthOnQuery()) ) ) + + getAlignListSortedByDecreasingScoreThenLength = staticmethod( getAlignListSortedByDecreasingScoreThenLength ) + + + ## Convert an Align file into a Path file + # + # @param alignFile string name of the input Align file + # @param pathFile string name of the output Path file + # + def convertAlignFileIntoPathFile( alignFile, pathFile ): + alignFileHandler = open( alignFile, "r" ) + pathFileHandler = open( pathFile, "w" ) + iAlign = Align() + countAlign = 0 + while True: + line = alignFileHandler.readline() + if line == "": + break + countAlign += 1 + iAlign.setFromString( line, "\t" ) + 
pathFileHandler.write( "%i\t%s\n" % ( countAlign, iAlign.toString() ) ) + alignFileHandler.close() + pathFileHandler.close() + + convertAlignFileIntoPathFile = staticmethod( convertAlignFileIntoPathFile ) + + + ## Sort an Align file + # + def sortAlignFile( inFile, outFile="" ): + if outFile == "": + outFile = "%s.sort" % ( inFile ) + prg = "sort" + cmd = prg + cmd += " -k 1,1 -k 4,4 -k 2,2n -k 3,3n -k 5,5n -k 6,6n -k 8,8n" + cmd += " %s" % ( inFile ) + cmd += " > %s" % ( outFile ) + exitStatus = os.system( cmd ) + if exitStatus != 0: + msg = "ERROR: '%s' returned '%i'" % ( prg, exitStatus ) + sys.stderr.write( "%s\n" % ( msg ) ) + sys.exit( exitStatus ) + + sortAlignFile = staticmethod( sortAlignFile ) + + + ## Write Align instances contained in the given list + # + # @param lAlign a list of Align instances + # @param fileName name of the file to write the Align instances + # @param mode the open mode of the file ""w"" or ""a"" + # + def writeListInFile( lAlign, fileName, mode="w" ): + fileHandler = open( fileName, mode ) + for iAlign in lAlign: + iAlign.write( fileHandler ) + fileHandler.close() + + writeListInFile = staticmethod( writeListInFile ) + + + ## Split a list of Align instances according to the name of the query + # + # @param lInAlign list of align instances + # @return lOutAlignList list of align instances lists + # + def splitAlignListByQueryName( lInAlign ): + lSortedAlign = sorted(lInAlign, key=lambda o: o.range_query.seqname) + lOutAlignList = [] + if len(lSortedAlign) != 0 : + lAlignForCurrentQuery = [] + previousQuery = lSortedAlign[0].range_query.seqname + for align in lSortedAlign : + currentQuery = align.range_query.seqname + if previousQuery != currentQuery : + lOutAlignList.append(lAlignForCurrentQuery) + previousQuery = currentQuery + lAlignForCurrentQuery = [] + lAlignForCurrentQuery.append(align) + + lOutAlignList.append(lAlignForCurrentQuery) + + return lOutAlignList + + splitAlignListByQueryName = staticmethod( 
splitAlignListByQueryName ) + + + ## Create an Align file from each list of Align instances in the input list + # + # @param lAlignList list of lists with Align instances + # @param pattern string + # @param dirName string + # + def createAlignFiles( lAlignList, pattern, dirName="" ): + savedDir = os.getcwd() + nbFiles = len(lAlignList) + countFile = 1 + if dirName != "" : + try: + os.makedirs(dirName) + except: + pass + os.chdir(dirName) + + for lAlign in lAlignList: + fileName = "%s_%s.align" % (pattern, str(countFile).zfill(len(str(nbFiles)))) + AlignUtils.writeListInFile(lAlign, fileName) + countFile += 1 + os.chdir(savedDir) + + createAlignFiles = staticmethod( createAlignFiles ) + + + ## Return a list with Align instances sorted by query name, subject name, query start, query end and score + # + def sortList( lAligns ): + return sorted( lAligns, key=lambda iAlign: ( iAlign.getQueryName(), + iAlign.getSubjectName(), + iAlign.getQueryStart(), + iAlign.getQueryEnd(), + iAlign.getScore() ) ) + + sortList = staticmethod( sortList ) + + + ## Return a list after merging all overlapping Align instances + # + def mergeList( lAligns ): + lMerged = [] + + lSorted = AlignUtils.sortList( lAligns ) + + prev_count = 0 + for iAlign in lSorted: + if prev_count != len(lSorted): + for i in lSorted[ prev_count + 1: ]: + if iAlign.isOverlapping( i ): + iAlign.merge( i ) + IsAlreadyInList = False + for newAlign in lMerged: + if newAlign.isOverlapping( iAlign ): + IsAlreadyInList = True + newAlign.merge( iAlign ) + lMerged [ lMerged.index( newAlign ) ] = newAlign + if not IsAlreadyInList: + lMerged.append( iAlign ) + prev_count += 1 + + return lMerged + + mergeList = staticmethod( mergeList ) + + + ## Merge all Align instance in a given Align file + # + def mergeFile( inFile, outFile="" ): + if outFile == "": + outFile = "%s.merged" % ( inFile ) + if os.path.exists( outFile ): + os.remove( outFile ) + + tmpFile = "%s.sorted" % ( inFile ) + AlignUtils.sortAlignFile( inFile, tmpFile 
) + + tmpF = open( tmpFile, "r" ) + dQrySbj2Aligns = {} + prevPairQrySbj = "" + while True: + line = tmpF.readline() + if line == "": + break + iAlign = Align() + iAlign.setFromString( line ) + pairQrySbj = "%s_%s" % ( iAlign.getQueryName(), iAlign.getSubjectName() ) + if not dQrySbj2Aligns.has_key( pairQrySbj ): + if prevPairQrySbj != "": + lMerged = AlignUtils.mergeList( dQrySbj2Aligns[ prevPairQrySbj ] ) + AlignUtils.writeListInFile( lMerged, outFile, "a" ) + del dQrySbj2Aligns[ prevPairQrySbj ] + prevPairQrySbj = pairQrySbj + else: + prevPairQrySbj = pairQrySbj + dQrySbj2Aligns[ pairQrySbj ] = [] + dQrySbj2Aligns[ pairQrySbj ].append( iAlign ) + lMerged = [] + if len(dQrySbj2Aligns.keys()) > 0: + lMerged = AlignUtils.mergeList( dQrySbj2Aligns[ prevPairQrySbj ] ) + AlignUtils.writeListInFile( lMerged, outFile, "a" ) + tmpF.close() + os.remove( tmpFile ) + + mergeFile = staticmethod( mergeFile ) + + + ## Update the scores of each match in the input file + # + # @note the new score is the length on the query times the percentage of identity + # + def updateScoresInFile( inFile, outFile ): + inHandler = open( inFile, "r" ) + outHandler = open( outFile, "w" ) + iAlign = Align() + + while True: + line = inHandler.readline() + if line == "": + break + iAlign.reset() + iAlign.setFromString( line, "\t" ) + iAlign.updateScore() + iAlign.write( outHandler ) + + inHandler.close() + outHandler.close() + + updateScoresInFile = staticmethod( updateScoresInFile ) diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/ConvCoord.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/ConvCoord.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,504 @@ +#!/usr/bin/env python + +##@file +# Convert coordinates from chunks to chromosomes or the opposite. 
+# +# usage: ConvCoord.py [ options ] +# options: +# -h: this help +# -i: input data with coordinates to convert (file or table) +# -f: input data format (default='align'/'path') +# -c: coordinates to convert (query, subject or both; default='q'/'s'/'qs') +# -m: mapping of chunks on chromosomes (format='map') +# -x: convert from chromosomes to chunks (opposite by default) +# -o: output data (file or table, same as input) +# -C: configuration file (for database connection) +# -v: verbosity level (default=0/1/2) + + +import os +import sys +import getopt +import time +from commons.core.sql.DbFactory import DbFactory +from commons.core.coord.MapUtils import MapUtils +from commons.core.sql.TableMapAdaptator import TableMapAdaptator +from commons.core.sql.TablePathAdaptator import TablePathAdaptator +from commons.core.coord.PathUtils import PathUtils +from commons.core.coord.Align import Align +from commons.core.coord.Path import Path +from commons.core.coord.Range import Range + + +## Class to handle coordinate conversion +# +class ConvCoord( object ): + + ## Constructor + # + def __init__( self, inData="", mapData="", outData="", configFile="", verbosity=0): + self._inData = inData + self._formatInData = "align" + self._coordToConvert = "q" + self._mapData = mapData + self._mergeChunkOverlaps = True + self._convertChunks = True + self._outData = outData + self._configFile = configFile + self._verbose = verbosity + self._typeInData = "file" + self._typeMapData = "file" + self._tpa = None + if self._configFile != "" and os.path.exists(self._configFile): + self._iDb = DbFactory.createInstance(self._configFile) + else: + self._iDb = DbFactory.createInstance() + + + ## Display the help on stdout + # + def help( self ): + print + print "usage: ConvCoord.py [ options ]" + print "options:" + print " -h: this help" + print " -i: input data with coordinates to convert (file or table)" + print " -f: input data format (default='align'/'path')" + print " -c: coordinates to convert 
(query, subject or both; default='q'/'s'/'qs')" + print " -m: mapping of chunks on chromosomes (format='map')" + print " -M: merge chunk overlaps (default=yes/no)" + print " -x: convert from chromosomes to chunks (opposite by default)" + print " -o: output data (file or table, same as input)" + print " -C: configuration file (for database connection)" + print " -v: verbosity level (default=0/1/2)" + print + + + ## Set the attributes from the command-line + # + def setAttributesFromCmdLine( self ): + try: + opts, args = getopt.getopt(sys.argv[1:],"hi:f:c:m:M:xo:C:v:") + except getopt.GetoptError, err: + sys.stderr.write( "%s\n" % ( str(err) ) ) + self.help(); sys.exit(1) + for o,a in opts: + if o == "-h": + self.help(); sys.exit(0) + elif o == "-i": + self.setInputData( a ) + elif o == "-f": + self.setInputFormat( a ) + elif o == "-c": + self.setCoordinatesToConvert( a ) + elif o == "-m": + self.setMapData( a ) + elif o == "-M": + self.setMergeChunkOverlaps( a ) + elif o == "-o": + self.setOutputData( a ) + elif o == "-C": + self.setConfigFile( a ) + elif o == "-v": + self.setVerbosityLevel( a ) + + + def setInputData( self, inData ): + self._inData = inData + + def setInputFormat( self, formatInData ): + self._formatInData = formatInData + + def setCoordinatesToConvert( self, coordToConvert ): + self._coordToConvert = coordToConvert + + def setMapData( self, mapData ): + self._mapData = mapData + + def setMergeChunkOverlaps( self, mergeChunkOverlaps ): + if mergeChunkOverlaps == "yes": + self._mergeChunkOverlaps = True + else: + self._mergeChunkOverlaps = False + + def setOutputData( self, outData ): + self._outData = outData + + def setConfigFile( self, configFile ): + self._configFile = configFile + + def setVerbosityLevel( self, verbose ): + self._verbose = int(verbose) + + + ## Check the attributes are valid before running the algorithm + # + def checkAttributes( self ): + if self._inData == "": + msg = "ERROR: missing input data (-i)" + sys.stderr.write( 
"%s\n" % ( msg ) ) + self.help(); sys.exit(1) + if self._formatInData not in ["align","path"]: + msg = "ERROR: unrecognized format '%s' (-f)" % ( self._formatInData ) + sys.stderr.write( "%s\n" % ( msg ) ) + self.help(); sys.exit(1) + if self._configFile == "": + self._iDb = DbFactory.createInstance() + elif not os.path.exists( self._configFile ): + msg = "ERROR: configuration file '%s' doesn't exist" % ( self._configFile ) + sys.stderr.write( "%s\n" % ( msg ) ) + self.help(); sys.exit(1) + else: + self._iDb = DbFactory.createInstance(self._configFile) + if not os.path.exists( self._inData ) and not self._iDb.doesTableExist( self._inData ): + msg = "ERROR: input data '%s' doesn't exist" % ( self._inData ) + sys.stderr.write( "%s\n" % ( msg ) ) + self.help(); sys.exit(1) + if os.path.exists( self._inData ): + self._typeInData = "file" + elif self._iDb.doesTableExist( self._inData ): + self._typeInData = "table" + if self._coordToConvert == "": + msg = "ERROR: missing coordinates to convert (-c)" + sys.stderr.write( "%s\n" % ( msg ) ) + self.help(); sys.exit(1) + if self._coordToConvert not in [ "q", "s", "qs" ]: + msg = "ERROR: unrecognized coordinates to convert '%s' (-c)" % ( self._coordToConvert ) + sys.stderr.write( "%s\n" % ( msg ) ) + self.help(); sys.exit(1) + if self._mapData == "": + msg = "ERROR: missing mapping coordinates of chunks on chromosomes (-m)" + sys.stderr.write( "%s\n" % ( msg ) ) + self.help(); sys.exit(1) + if not os.path.exists( self._mapData ) and not self._iDb.doesTableExist( self._mapData ): + msg = "ERROR: mapping data '%s' doesn't exist" % ( self._mapData ) + sys.stderr.write( "%s\n" % ( msg ) ) + self.help(); sys.exit(1) + if os.path.exists( self._mapData ): + self._typeMapData = "file" + elif self._iDb.doesTableExist( self._mapData ): + self._typeMapData = "table" + if self._outData == "": + if self._convertChunks: + self._outData = "%s.onChr" % ( self._inData ) + else: + self._outData = "%s.onChk" % ( self._inData ) + if 
self._typeInData == "table": + self._outData = self._outData.replace(".","_") + + + ## Return a dictionary with the mapping of the chunks on the chromosomes + # + def getChunkCoordsOnChromosomes( self ): + if self._typeMapData == "file": + dChunks2CoordMaps = MapUtils.getDictPerNameFromMapFile( self._mapData ) + elif self._typeMapData == "table": + tma = TableMapAdaptator( self._iDb, self._mapData ) + dChunks2CoordMaps = tma.getDictPerName() + if self._verbose > 0: + msg = "nb of chunks: %i" % ( len(dChunks2CoordMaps.keys()) ) + sys.stdout.write( "%s\n" % ( msg ) ) + return dChunks2CoordMaps + + + def getRangeOnChromosome( self, chkRange, dChunks2CoordMaps ): + chrRange = Range() + chunkName = chkRange.seqname + chrRange.seqname = dChunks2CoordMaps[ chunkName ].seqname + if dChunks2CoordMaps[ chunkName ].start == 1: + chrRange.start = chkRange.start + chrRange.end = chkRange.end + else: + startOfChkOnChr = dChunks2CoordMaps[ chunkName ].start + chrRange.start = startOfChkOnChr + chkRange.start - 1 + chrRange.end = startOfChkOnChr + chkRange.end - 1 + return chrRange + + + def convCoordsChkToChrFromAlignFile( self, inFile, dChunks2CoordMaps ): + return self.convCoordsChkToChrFromAlignOrPathFile( inFile, dChunks2CoordMaps, "align" ) + + + def convCoordsChkToChrFromPathFile( self, inFile, dChunks2CoordMaps ): + return self.convCoordsChkToChrFromAlignOrPathFile( inFile, dChunks2CoordMaps, "path" ) + + + + ## Convert coordinates of a Path or Align file from chunks to chromosomes + # + def convCoordsChkToChrFromAlignOrPathFile( self, inFile, dChunks2CoordMaps, format ): + if self._verbose > 0: + msg = "start method 'convCoordsChkToChrFromAlignOrPathFile'" + sys.stdout.write( "%s\n" % ( msg ) ) + outFile = "%s.tmp" % ( inFile ) + inFileHandler = open( inFile, "r" ) + outFileHandler = open( outFile, "w" ) + if format == "align": + iObject = Align() + else: + iObject = Path() + countLine = 0 + + while True: + line = inFileHandler.readline() + if line == "": + break + 
countLine += 1 + iObject.setFromString( line ) + if self._coordToConvert in [ "q", "qs" ]: + queryOnChr = self.getRangeOnChromosome( iObject.range_query, dChunks2CoordMaps ) + iObject.range_query = queryOnChr + if self._coordToConvert in [ "s", "qs" ]: + subjectOnChr = self.getRangeOnChromosome( iObject.range_subject, dChunks2CoordMaps ) + iObject.range_subject = subjectOnChr + iObject.write( outFileHandler ) + iObject.reset() + + inFileHandler.close() + outFileHandler.close() + if self._verbose > 0: + msg = "end method 'convCoordsChkToChrFromAlignOrPathFile'" + sys.stdout.write( "%s\n" % ( msg ) ) + return outFile + + ## Convert coordinates of a file from chunks to chromosomes + # + def convCoordsChkToChrFromFile( self, inFile, format, dChunks2CoordMaps ): + if self._verbose > 0: + msg = "start convCoordsChkToChrFromFile" + sys.stdout.write( "%s\n" % ( msg ) ) + if format == "align": + tmpAlignFile = self.convCoordsChkToChrFromAlignFile( inFile, dChunks2CoordMaps ) + tmpAlignTable = tmpAlignFile.replace(".","_").replace("-","_") + self._iDb.createTable( tmpAlignTable, "align", tmpAlignFile, True) + os.remove( tmpAlignFile ) + self._iDb.removeDoublons( tmpAlignTable ) + outTable = "%s_path" % ( tmpAlignTable ) + self._iDb.convertAlignTableIntoPathTable( tmpAlignTable, outTable ) + self._iDb.dropTable( tmpAlignTable ) + elif format == "path": + tmpPathFile = self.convCoordsChkToChrFromPathFile( inFile, dChunks2CoordMaps ) + outTable = tmpPathFile.replace(".","_").replace("-","_") + self._iDb.createTable( outTable, "path", tmpPathFile, True) + os.remove( tmpPathFile ) + if self._verbose > 0: + msg = "end convCoordsChkToChrFromFile" + sys.stdout.write( "%s\n" % ( msg ) ) + return outTable + + + ## Convert coordinates of a table from chunks to chromosomes + # + def convCoordsChkToChrFromTable( self, inTable, format, dChunks2CoordMaps ): + tmpFile = inTable + self._iDb.exportDataToFile( inTable, tmpFile, False ) + outTable = self.convCoordsChkToChrFromFile( tmpFile, 
format, dChunks2CoordMaps ) + os.remove( tmpFile ) + return outTable + + + def getListsDirectAndReversePaths( self, lPaths ): + lDirectPaths = [] + lReversePaths = [] + for iPath in lPaths: + if iPath.isQueryOnDirectStrand() and iPath.isSubjectOnDirectStrand(): + lDirectPaths.append( iPath ) + else: + lReversePaths.append( iPath ) + return lDirectPaths, lReversePaths + + + def mergePaths( self, lPaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId ): + if len(lPaths) < 2: + lIdsToInsert.append( lPaths[0].id ) + return + i = 0 + while i < len(lPaths) - 1: + i += 1 + if self._verbose > 1 and i==1 : + print lPaths[i-1] + if self._verbose > 1: + print lPaths[i] + sys.stdout.flush() + idPrev = lPaths[i-1].id + idNext = lPaths[i].id + if lPaths[i-1].canMerge( lPaths[i] ): + dOldIdToNewId[ idNext ] = idPrev + if idPrev not in lIdsToInsert: + lIdsToInsert.append( idPrev ) + if idNext not in lIdsToDelete: + lIdsToDelete.append( idNext ) + lPaths[i-1].merge( lPaths[i] ) + del lPaths[i] + i -= 1 + + + def insertPaths( self, lPaths, lIdsToInsert, dOldIdToNewId ): + for iPath in lPaths: + if dOldIdToNewId.has_key( iPath.id ): + iPath.id = dOldIdToNewId[ iPath.id ] + if iPath.id in lIdsToInsert: + self._tpa.insert( iPath ) + + + ## Merge Path instances in a Path table when they correspond to chunk overlaps + # + def mergeCoordsOnChunkOverlaps( self, dChunks2CoordMaps, tmpPathTable ): + if self._verbose > 0: + msg = "start method 'mergeCoordsOnChunkOverlaps'" + sys.stdout.write( "%s\n" % ( msg ) ) + self._tpa = TablePathAdaptator( self._iDb, tmpPathTable ) + nbChunks = len(dChunks2CoordMaps.keys()) + for numChunk in range(1,nbChunks): + chunkName1 = "chunk%s" % ( str(numChunk).zfill( len(str(nbChunks)) ) ) + chunkName2 = "chunk%s" % ( str(numChunk+1).zfill( len(str(nbChunks)) ) ) + if not dChunks2CoordMaps.has_key( chunkName2 ): + break + if self._verbose > 1: + msg = "try merge on '%s' and '%s'" % ( chunkName1, chunkName2 ) + sys.stdout.write( "%s\n" % ( msg ) ) + sys.stdout.flush() 
+ chrName = dChunks2CoordMaps[ chunkName1 ].seqname + if dChunks2CoordMaps[ chunkName2 ].seqname != chrName: + if self._verbose > 1: + msg = "not on same chromosome (%s != %s)" % ( dChunks2CoordMaps[ chunkName2 ].seqname, chrName ) + sys.stdout.write( "%s\n" % ( msg ) ) + sys.stdout.flush() + continue + minCoord = min( dChunks2CoordMaps[ chunkName1 ].end, dChunks2CoordMaps[ chunkName2 ].start ) + maxCoord = max( dChunks2CoordMaps[ chunkName1 ].end, dChunks2CoordMaps[ chunkName2 ].start ) + lPaths = self._tpa.getChainListOverlappingQueryCoord( chrName, minCoord, maxCoord ) + if len(lPaths) == 0: + if self._verbose > 1: + msg = "no overlapping matches on %s (%i->%i)" % ( chrName, minCoord, maxCoord ) + sys.stdout.write( "%s\n" % ( msg ) ) + sys.stdout.flush() + continue + if self._verbose > 1: + msg = "%i overlapping matche(s) on %s (%i->%i)" % ( len(lPaths), chrName, minCoord, maxCoord ) + sys.stdout.write( "%s\n" % ( msg ) ) + sys.stdout.flush() + lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths ) + lDirectPaths, lReversePaths = self.getListsDirectAndReversePaths( lSortedPaths ) + lIdsToInsert = [] + lIdsToDelete = [] + dOldIdToNewId = {} + if len(lDirectPaths) > 0: + self.mergePaths( lDirectPaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId ) + if len(lReversePaths) > 0: + self.mergePaths( lReversePaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId ) + self._tpa.deleteFromIdList( lIdsToDelete ) + self._tpa.deleteFromIdList( lIdsToInsert ) + self.insertPaths( lDirectPaths, lIdsToInsert, dOldIdToNewId ) + self.insertPaths( lReversePaths, lIdsToInsert, dOldIdToNewId ) + if self._verbose > 0: + msg = "end method 'mergeCoordsOnChunkOverlaps'" + sys.stdout.write( "%s\n" % ( msg ) ) + sys.stdout.flush() + + + def saveChrCoordsAsFile( self, tmpPathTable, outFile ): + self._iDb.exportDataToFile( tmpPathTable, tmpPathTable, False ) + self._iDb.dropTable( tmpPathTable ) + if self._formatInData == "align": + 
PathUtils.convertPathFileIntoAlignFile( tmpPathTable, outFile ) + os.remove( tmpPathTable ) + elif self._formatInData == "path": + os.rename( tmpPathTable, outFile ) + + + def saveChrCoordsAsTable( self, tmpPathTable, outTable ): + if self._formatInData == "align": + self._iDb.convertPathTableIntoAlignTable( tmpPathTable, outTable ) + self._iDb.dropTable( tmpPathTable ) + elif self._formatInData == "path": + self._iDb.renameTable( tmpPathTable, outTable ) + + + ## Convert coordinates from chunks to chromosomes + # + def convertCoordinatesFromChunksToChromosomes( self ): + dChunks2CoordMaps = self.getChunkCoordsOnChromosomes() + + if self._typeInData == "file": + tmpPathTable = self.convCoordsChkToChrFromFile( self._inData, self._formatInData, dChunks2CoordMaps ) + elif self._typeInData == "table": + tmpPathTable = self.convCoordsChkToChrFromTable( self._inData, self._formatInData, dChunks2CoordMaps ) + + if self._mergeChunkOverlaps: + self.mergeCoordsOnChunkOverlaps( dChunks2CoordMaps, tmpPathTable ); + + if self._typeInData == "file": + self.saveChrCoordsAsFile( tmpPathTable, self._outData ) + elif self._typeInData == "table": + self.saveChrCoordsAsTable( tmpPathTable, self._outData ) + + + ## Convert coordinates from chromosomes to chunks + # + def convertCoordinatesFromChromosomesToChunks( self ): + msg = "ERROR: convert coordinates from chromosomes to chunks not yet available" + sys.stderr.write( "%s\n" % ( msg ) ) + sys.exit(1) + + + ## Useful commands before running the program + # + def start( self ): + self.checkAttributes() + if self._verbose > 0: + msg = "START ConvCoord.py (%s)" % ( time.strftime("%m/%d/%Y %H:%M:%S") ) + msg += "\ninput data: %s" % ( self._inData ) + if self._typeInData == "file": + msg += " (file)\n" + else: + msg += " (table)\n" + msg += "format: %s\n" % ( self._formatInData ) + msg += "coordinates to convert: %s\n" % ( self._coordToConvert ) + msg += "mapping data: %s" % ( self._mapData ) + if self._typeMapData == "file": + msg += " 
(file)\n" + else: + msg += " (table)\n" + if self._mergeChunkOverlaps: + msg += "merge chunk overlaps\n" + else: + msg += "don't merge chunk overlaps\n" + if self._convertChunks: + msg += "convert chunks to chromosomes\n" + else: + msg += "convert chromosomes to chunks\n" + msg += "output data: %s" % ( self._outData ) + if self._typeInData == "file": + msg += " (file)\n" + else: + msg += " (table)\n" + sys.stdout.write( msg ) + + + ## Useful commands before ending the program + # + def end( self ): + self._iDb.close() + if self._verbose > 0: + msg = "END ConvCoord.py (%s)" % ( time.strftime("%m/%d/%Y %H:%M:%S") ) + sys.stdout.write( "%s\n" % ( msg ) ) + + + ## Run the program + # + def run( self ): + self.start() + + if self._convertChunks: + self.convertCoordinatesFromChunksToChromosomes() + else: + self.convertCoordinatesFromChromosomesToChunks() + + self.end() + + +if __name__ == "__main__": + i = ConvCoord() + i.setAttributesFromCmdLine() + i.run() diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/Map.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Map.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,161 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.coord.Range import Range + + +## Record a named region on a given sequence +# +class Map( Range ): + + ## Constructor + # + # @param name the name of the region + # @param seqname the name of the sequence + # @param start the start coordinate + # @param end the end coordinate + # + def __init__(self, name="", seqname="", start=-1, end=-1): + self.name = name + Range.__init__( self, seqname, start, end ) + + ## Equal operator + # + # @param o a Map instance + # + def __eq__(self, o): + if self.name == o.name: + return Range.__eq__(self, o) + return False + + ## Return name + # + def getName( self ): + return self.name + + ## Set attributes from tuple + # + # @param tuple: a tuple with (name,seqname,start,end) + # + def setFromTuple(self, tuple): + self.name = tuple[0] + Range.setFromTuple(self, tuple[1:]) + + ## Set attributes from string + # + # @param string a string formatted like nameseqnamestartend + # @param sep field separator + # + def setFromString(self, string, sep="\t"): + if string[-1] == "\n": + string = string[:-1] + self.setFromTuple( string.split(sep) ) + + ## Reset + # + def reset(self): + 
self.setFromTuple( [ "", "", -1, -1 ] ) + + ## Read attributes from a Map file + # + # @param fileHandler: file handler of the file being read + # @return: 1 on success, 0 at the end of the file + # + def read(self, fileHandler): + self.reset() + line = fileHandler.readline() + if line == "": + return 0 + tokens = line.split("\t") + if len(tokens) < len(self.__dict__.keys()): + return 0 + self.setFromTuple(tokens) + return 1 + + ## Return the attributes as a formatted string + # + def toString(self): + string = "%s" % (self.name) + string += "\t%s" % (Range.toString(self)) + return string + + ## Write attributes into a Map file + # + # @param fileHandler: file handler of the file being filled + # + def write(self, fileHandler): + fileHandler.write("%s\n" % (self.toString())) + + ## Save attributes into a Map file + # + # @param file: name of the file being filled + # + def save(self, file): + fileHandler = open( file, "a" ) + self.write( fileHandler ) + fileHandler.close() + + ## Return a Range instance with the attributes + # + def getRange(self): + return Range( self.seqname, self.start, self.end) + + ## Remove in the instance the region overlapping with another Map instance + # + # @param o a Map instance + # + def diff(self, o): + iRange = Range.diff(self, o.getRange()) + new = Map() + if not iRange.isEmpty(): + new.name = self.name + new.seqname = self.seqname + new.start = iRange.start + new.end = iRange.end + return new + + ## Write attributes in a Path file, the name being the subject and the rest the Range query + # + # @param fileHandler: file handler of a Path file + # + def writeAsQueryOfPath(self, fileHandler): + string = "0" + string += "\t%s" % ( self.seqname ) + string += "\t%i" % ( self.getMin() ) + string += "\t%i" % ( self.getMax() ) + string += "\t%s" % ( self.name ) + string += "\t0" + string += "\t0" + string += "\t0.0" + string += "\t0" + string += "\t0" + fileHandler.write( "%s\n" % ( string ) ) + diff -r d22fadc825e3 -r 2c0c0a89fad7 
commons/core/coord/MapUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/MapUtils.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,246 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +import sys +import os +from commons.core.coord.Map import Map +from commons.core.coord.Set import Set +try: + from commons.core.checker.CheckerUtils import CheckerUtils +except ImportError: + pass + + +## static methods manipulating Map instances +# +class MapUtils( object ): + + ## Return a list of Map instances sorted in increasing order according to the min, then the max, and finally their initial order + # + # @param lMaps list of Map instances + # + def getMapListSortedByIncreasingMinThenMax( lMaps ): + return sorted( lMaps, key=lambda iMap: ( iMap.getMin(), iMap.getMax() ) ) + + getMapListSortedByIncreasingMinThenMax = staticmethod( getMapListSortedByIncreasingMinThenMax ) + + + ## Return a list of Map instances sorted in increasing order according to the name, then the seqname, then the min, then the max + # + # @param lMaps list of Map instances + # + def getMapListSortedByIncreasingNameThenSeqnameThenMinThenMax( lMaps ): + return sorted( lMaps, key=lambda iMap: ( iMap.getName(), iMap.getSeqname(), iMap.getMin(), iMap.getMax() ) ) + + getMapListSortedByIncreasingNameThenSeqnameThenMinThenMax = staticmethod( getMapListSortedByIncreasingNameThenSeqnameThenMinThenMax ) + + + ## Return a dictionary which keys are Map names and values the corresponding Map instances + # + def getDictPerNameFromMapFile( mapFile ): + dName2Maps = {} + mapFileHandler = open( mapFile, "r" ) + while True: + line = mapFileHandler.readline() + if line == "": + break + iMap = Map() + iMap.setFromString( line, "\t" ) + if dName2Maps.has_key( iMap.name ): + if iMap == dName2Maps[ iMap.name ]: + continue + else: + msg = "ERROR: in file '%s' two different Map instances have the same name '%s'" % ( mapFile, iMap.name ) + sys.stderr.write( "%s\n" % ( msg ) ) + sys.exit(1) + dName2Maps[ iMap.name ] = iMap + mapFileHandler.close() + return dName2Maps + + getDictPerNameFromMapFile = staticmethod( getDictPerNameFromMapFile ) + + + ## Give a list of Set instances from a list of Map instances 
+ # + # @param lMaps list of Map instances + # @return lSets list of Set instances + # + def mapList2SetList( lMaps ): + lSets = [] + c = 0 + for iMap in lMaps: + c += 1 + iSet = Set() + iSet.id = c + iSet.name = iMap.getName() + iSet.seqname = iMap.getSeqname() + iSet.start = iMap.getStart() + iSet.end = iMap.getEnd() + lSets.append( iSet ) + return lSets + + mapList2SetList = staticmethod( mapList2SetList ) + + + ## Merge the Map instances in a Map file using 'mapOp' + # + def mergeCoordsInFile( inFile, outFile ): + if not sys.modules.has_key( "commons.core.checker.CheckerUtils" ): + msg = "WARNING: can't find module 'CheckerUtils'" + sys.stderr.write( "%s\n" % msg ) + elif not CheckerUtils.isExecutableInUserPath( "mapOp" ): + msg = "WARNING: can't find executable 'mapOp'" + sys.stderr.write( "%s\n" % msg ) + else: + cmd = "mapOp" + cmd += " -q %s" % ( inFile ) + cmd += " -m" + cmd += " 2>&1 > /dev/null" + returnStatus = os.system( cmd ) + if returnStatus != 0: + print "ERROR: mapOp returned %i" % ( returnStatus ) + sys.exit(1) + os.rename( "%s.merge" % inFile, + outFile ) + + mergeCoordsInFile = staticmethod( mergeCoordsInFile ) + + + ## Return a dictionary which keys are Map seqnames and values the corresponding Map instances + # + def getDictPerSeqNameFromMapFile( mapFile ): + dSeqName2Maps = {} + mapFileHandler = open( mapFile, "r" ) + while True: + line = mapFileHandler.readline() + if line == "": + break + iMap = Map() + iMap.setFromString( line, "\t" ) + if not dSeqName2Maps.has_key( iMap.seqname ): + dSeqName2Maps[ iMap.seqname ] = [] + dSeqName2Maps[ iMap.seqname ].append( iMap ) + mapFileHandler.close() + return dSeqName2Maps + + getDictPerSeqNameFromMapFile = staticmethod( getDictPerSeqNameFromMapFile ) + + + ## Convert an Map file into a Set file + # + # @param mapFile string input map file name + # @param setFile string output set file name + # + def convertMapFileIntoSetFile( mapFileName, setFileName = "" ): + if setFileName == "": + setFileName = 
"%s.set" % mapFileName + mapFileHandler = open( mapFileName, "r" ) + setFileHandler = open( setFileName, "w" ) + iMap = Map() + count = 0 + while True: + line = mapFileHandler.readline() + if line == "": + break + iMap.setFromString(line) + count += 1 + iSet = Set() + iSet.id = count + iSet.name = iMap.getName() + iSet.seqname = iMap.getSeqname() + iSet.start = iMap.getStart() + iSet.end = iMap.getEnd() + iSet.write(setFileHandler) + mapFileHandler.close() + setFileHandler.close() + + convertMapFileIntoSetFile = staticmethod( convertMapFileIntoSetFile ) + + ## Write Map instances contained in the given list + # + # @param lMaps list of Map instances + # @param fileName a file name + # @param mode the open mode of the file '"w"' or '"a"' + # + def writeListInFile(lMaps, fileName, mode="w"): + fileHandler = open(fileName, mode) + for iMap in lMaps: + iMap.write(fileHandler) + fileHandler.close() + + writeListInFile = staticmethod( writeListInFile ) + + + ## Get the length of the shorter seq in map file + # + # @param mapFileName + # @param mode the open mode of the file '"w"' or '"a"' + # + def getMinLengthOfMapFile(self, mapFileName): + fileHandler = open(mapFileName, "r") + line = fileHandler.readline() + start = int (line.split('\t')[2]) + end = int (line.split('\t')[3]) + min = end - start + 1 + while True: + line = fileHandler.readline() + if line == "": + break + start = int (line.split('\t')[2]) + end = int (line.split('\t')[3]) + currentMin = end - start + 1 + if min >= currentMin: + min = currentMin + fileHandler.close() + return min + + ## Get the max length of the shorter seq in map file + # + # @param mapFileName + # @param mode the open mode of the file '"w"' or '"a"' + # + def getMaxLengthOfMapFile(self, mapFileName): + fileHandler = open(mapFileName, "r") + line = fileHandler.readline() + start = int (line.split('\t')[2]) + end = int (line.split('\t')[3]) + max = end - start + 1 + while True: + line = fileHandler.readline() + if line == "": + break + 
start = int (line.split('\t')[2]) + end = int (line.split('\t')[3]) + currentMax = end - start + 1 + if max <= currentMax: + max = currentMax + fileHandler.close() + return max \ No newline at end of file diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/Match.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Match.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,206 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +import sys +from commons.core.coord.Range import Range +from commons.core.coord.Path import Path + + +## Handle a chain of match(es) between two sequences, query and subject, with an identifier and the length of the input sequences +# +class Match( Path ): + + ## Constructor + # + def __init__(self): + Path.__init__(self) + self.query_length = -1 + self.query_length_perc = -1 # length of the match on the query / length of the query + self.query_seqlength = -1 + self.match_length_perc = -1 # length of the match on the query / total length of the subject + self.subject_length = -1 + self.subject_length_perc = -1 # length of the match on the subject / length of the subject + self.subject_seqlength = -1 + + ## Equal operator + # + def __eq__(self, o): + if o == None \ + or self.query_length != o.query_length or self.query_length_perc != o.query_length_perc\ + or self.query_seqlength != o.query_seqlength or self.subject_length != o.subject_length\ + or self.subject_length_perc != o.subject_length_perc or self.subject_seqlength != o.subject_seqlength\ + or self.match_length_perc != o.match_length_perc: + return False + return Path.__eq__(self, o) + + ## Return the length of the match on the query divided by the total length of the query + # + def getLengthPercOnQuery(self): + return self.query_length_perc + + ## Return the length of the match on the subject divided by the total length of the subject + # + def getLengthPercOnSubject(self): + return self.subject_length_perc + + ## Return the length of the match on the subject + # + def getLengthMatchOnSubject(self): + return self.subject_length + + ## Set attributes from a tuple + # + # @param tuple: a tuple with (query name,query start,query end, + # query length, query length perc (between 0-1), match length perc (between 0-1), subject name, + # subject start,subject end,subject length, subject length percentage (between 0-1), e_value,score,identity,id) + # + def setFromTuple( self, tuple ): + queryStart = 
int(tuple[1]) + queryEnd = int(tuple[2]) + subjectStart = int(tuple[7]) + subjectEnd = int(tuple[8]) + if queryStart < queryEnd: + self.range_query = Range(tuple[0],queryStart,queryEnd) + self.range_subject = Range(tuple[6],subjectStart,subjectEnd) + else: + self.range_query = Range(tuple[0],queryEnd,queryStart) + self.range_subject = Range(tuple[6],subjectEnd,subjectStart) + self.query_length = int(tuple[3]) + self.query_length_perc = float(tuple[4]) + self.query_seqlength = int( self.query_length / self.query_length_perc ) + self.match_length_perc = float(tuple[5]) + self.subject_length = int(tuple[9]) + self.subject_length_perc = float(tuple[10]) + self.subject_seqlength = int( self.subject_length / self.subject_length_perc ) + self.e_value = float(tuple[11]) + self.score = float(tuple[12]) + self.identity = float(tuple[13]) + self.id = int(tuple[14]) + + ## Reset + # + def reset( self ): + Path.reset( self ) + self.query_length = -1 + self.query_length_perc = -1 + self.query_seqlength = -1 + self.match_length_perc = -1 + self.subject_length = -1 + self.subject_length_perc = -1 + self.subject_seqlength = -1 + + ## Return a formated string of the attribute data + # + def toString( self ): + string = "%s" % ( self.range_query.toString() ) + string += "\t%i\t%f" % ( self.query_length, + self.query_length_perc ) + string += "\t%f" % ( self.match_length_perc ) + string += "\t%s" % ( self.range_subject.toString() ) + string += "\t%i\t%f" % ( self.subject_length, + self.subject_length_perc ) + string += "\t%g\t%i\t%f" % ( self.e_value, + self.score, + self.identity ) + string += "\t%i" % ( self.id ) + return string + + ## Return a Path instance + # + def getPathInstance( self ): + p = Path() + tuple = ( self.id, + self.range_query.seqname, + self.range_query.start, + self.range_query.end, + self.range_subject.seqname, + self.range_subject.start, + self.range_subject.end, + self.e_value, + self.score, + self.identity ) + p.setFromTuple( tuple ) + return p + + ## Give 
information about a match whose query is included in the subject + # + # @return string + # + def getQryIsIncluded( self ): + string = "query %s (%d bp: %d-%d) is contained in subject %s (%d bp: %d-%d): id=%.2f - %.3f - %.3f - %.3f" %\ + ( self.range_query.seqname, self.query_seqlength, self.range_query.start, self.range_query.end, + self.range_subject.seqname, self.subject_seqlength, self.range_subject.start, self.range_subject.end, + self.identity, self.query_length_perc, self.match_length_perc, self.subject_length_perc ) + return string + + def increaseLengthPercOnQuery(self, coverage): + self.query_length_perc += coverage + + ## Compare the object with another match and see if they are equal + # (same identity, E-value and score + same subsequences whether in query or subject) + # + # @return True if objects are equals False otherwise + # + def isDoublonWith( self, match, verbose=0 ): + + # if both matches have same identity, score and E-value + if self.identity == match.identity and self.score == match.score and self.e_value == match.e_value: + + # if query and subject are identical + if ( self.range_query.seqname == match.range_query.seqname \ + and self.range_subject.seqname == match.range_subject.seqname ): + + # if the coordinates are equal + if self.range_query.__eq__( match.range_query ) and self.range_subject.__eq__( match.range_subject ): + return True + + else: + if verbose > 0: print "different coordinates"; sys.stdout.flush() + return False + + # if query and subject are reversed but identical + elif self.range_query.seqname == match.range_subject.seqname and self.range_subject.seqname == match.range_query.seqname: + + # if the coordinates are equal + if self.range_query.__eq__( match.range_subject ) and self.range_subject.__eq__( match.range_query ): + return True + + else: + if verbose > 0: print "different coordinates"; sys.stdout.flush() + return False + + else: + if verbose > 0: print "different sequence names"; sys.stdout.flush() + return 
False + + else: + if verbose > 0: print "different match numbers"; sys.stdout.flush() + return False diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/MatchUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/MatchUtils.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,288 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Static methods for the manipulation of Match instances
#
class MatchUtils(object):

    ## Header line written at the top of a file in the Match format
    #
    _MATCH_FILE_HEADER = "query.name\tquery.start\tquery.end\tquery.length\tquery.length.%\tmatch.length.%\tsubject.name\tsubject.start\tsubject.end\tsubject.length\tsubject.length.%\tE.value\tScore\tIdentity\tpath\n"

    ## Return a list with Match instances from the given file
    #
    # @param inFile name of a file in the Match format
    # @return a list of Match instances
    # @note lines starting with "query.name" (headers) are skipped
    #
    @staticmethod
    def getMatchListFromFile(inFile):
        lMatchInstances = []
        # 'with' guarantees the handle is closed even if a line cannot be parsed
        with open(inFile, "r") as inFileHandler:
            for line in inFileHandler:
                if line.startswith("query.name"):
                    continue
                iMatch = Match()
                iMatch.setFromString(line)
                lMatchInstances.append(iMatch)
        return lMatchInstances

    ## Split a Match list in several Match lists according to the subject
    #
    # @param lMatches a list of Match instances
    # @return a dictionary which keys are subject names and values Match lists
    #
    @staticmethod
    def getDictOfListsWithSubjectAsKey(lMatches):
        dSubject2MatchList = {}
        for iMatch in lMatches:
            dSubject2MatchList.setdefault(iMatch.range_subject.seqname, []).append(iMatch)
        return dSubject2MatchList

    ## Split a Match list in several Match lists according to the query
    #
    # @param lMatches a list of Match instances
    # @return a dictionary which keys are query names and values Match lists
    #
    @staticmethod
    def getDictOfListsWithQueryAsKey(lMatches):
        dQuery2MatchList = {}
        for iMatch in lMatches:
            dQuery2MatchList.setdefault(iMatch.range_query.seqname, []).append(iMatch)
        return dQuery2MatchList

    ## Write Match instances contained in the given list
    #
    # @param lMatches a list of Match instances
    # @param fileName name of the file to write the Match instances
    # @param mode the open mode of the file "w" or "a"
    # @param header string written as is before the matches (nothing is written if empty or None)
    #
    @staticmethod
    def writeListInFile(lMatches, fileName, mode="w", header=None):
        with open(fileName, mode) as fileHandler:
            if header:
                fileHandler.write(header)
            for iMatch in lMatches:
                iMatch.write(fileHandler)

    ## Give path id list from a list of Match instances
    #
    # @param lMatch list of Match instances
    #
    # @return lId list of the 'id' attribute of each Match, in the input order
    #
    @staticmethod
    def getIdListFromMatchList(lMatch):
        return [iMatch.id for iMatch in lMatch]

    ## Remove duplicated matches in a match list
    #
    ## replace old PyRepet.MatchDB.rmvDoublons()
    # @param lMatch list of Match instances
    #
    # @return lMatchesUniq match unique list (first occurrence of each duplicate is kept)
    # @throw RepetException if two kept matches with different ids are still duplicates
    #
    @staticmethod
    def rmvDuplicateMatches(lMatch):
        lMatchesUniq = []
        for iMatch in lMatch:
            if not any(iMatch.isDoublonWith(iKept) for iKept in lMatchesUniq):
                lMatchesUniq.append(iMatch)

        # sanity check: the kept matches must be pairwise distinct
        for iMatch1 in lMatchesUniq:
            for iMatch2 in lMatchesUniq:
                if iMatch1.id != iMatch2.id and iMatch1.isDoublonWith(iMatch2):
                    raise RepetException("*** Error: doublon not removed")
        return lMatchesUniq

    ## Return the list of queries not 'included' in subjects when two different databanks are used.
    #
    ## replace old pyRepet.MatchDB.filterDiffQrySbj()
    #
    # @param iBioseqDB bioseqDB databank of queries (its 'idx' keys are the query names)
    #
    # @param matchFile name of a file in the Match format
    #
    # @param thresIdentity float identity threshold (multiplied by 100 before being compared to the match identity)
    #
    # @param thresLength float length threshold (compared as is to the query length ratio of the match)
    #
    # @param verbose int verbosity
    #
    # @return lQryToKeep list of the query names having no match above both thresholds
    #TODO: don't take into account match for sequence against itself. To do ?
    @staticmethod
    def filterDiffQrySbj(iBioseqDB, matchFile, thresIdentity=0.95, thresLength=0.98, verbose=0):
        if verbose > 0:
            # single-argument call form: same output with the Python 2 statement and the Python 3 function
            print("filtering matches (id>=%.2f,qlgth>=%.2f)..." % (thresIdentity, thresLength))
            sys.stdout.flush()

        thresIdentityPerc = math.floor(thresIdentity * 100)
        dQry2Matches = MatchUtils.getDictOfListsWithQueryAsKey(MatchUtils.getMatchListFromFile(matchFile))

        lQryToKeep = []
        sQryKept = set()   # constant-time membership test, the list keeps the order
        for seqH in iBioseqDB.idx.keys():
            if seqH in sQryKept:
                continue
            # a query without any match is kept as well (empty list => nothing above thresholds)
            isConditionsMet = any(iMatch.identity >= thresIdentityPerc and iMatch.query_length_perc >= thresLength
                                  for iMatch in dQry2Matches.get(seqH, []))
            if not isConditionsMet:
                lQryToKeep.append(seqH)
                sQryKept.add(seqH)
        return lQryToKeep

    ## Count the number of distinct sequences involved in at least one match above the thresholds.
    #
    ## replace old pyRepet.coord.MatchDB.getNbDistinctSbjWithThres() and pyRepet.coord.MatchDB.getNbDistinctSbjWithThres()
    # @param lMatches a list of Match instances
    #
    # @param thresIdentity float identity threshold
    #
    # @param thresLength float length threshold
    #
    # @param whatToCount "query" to group the matches by query, anything else to group them by subject;
    #        its lower-cased value + "_length_perc" is the Match attribute compared to thresLength
    #
    # @return int number of distinct sequences
    #
    @staticmethod
    def getNbDistinctSequencesInsideMatchesWithThresh(lMatches, thresIdentity=0.95, thresLength=0.98, whatToCount="query"):
        thresIdentityPerc = math.floor(thresIdentity * 100)
        what = whatToCount.lower()
        if what == "query":
            dMatches = MatchUtils.getDictOfListsWithQueryAsKey(lMatches)
        else:
            dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)
        lengthAttribute = what + "_length_perc"

        countSeq = 0
        for seqName in dMatches:
            if any(iMatch.identity >= thresIdentityPerc and getattr(iMatch, lengthAttribute) >= thresLength
                   for iMatch in dMatches[seqName]):
                countSeq += 1
        return countSeq

    ## Convert a 'match' file (output from Matcher) into an 'align' file
    #
    ## replace old parser.tab2align
    #
    # @param inFileName a string input file name
    # @note the output file is named after the input file, its extension being replaced by ".align"
    #
    @staticmethod
    def convertMatchFileToAlignFile(inFileName):
        basename = os.path.splitext(inFileName)[0]
        outFileName = "%s.align" % basename

        # read everything before opening the output: both names are the same file
        # when the input already ends with ".align"
        lMatches = MatchUtils.getMatchListFromFile(inFileName)

        with open(outFileName, "w") as outFile:
            for iMatch in lMatches:
                string = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (iMatch.getQueryName(), iMatch.getQueryStart(), iMatch.getQueryEnd(), iMatch.getSubjectName(), iMatch.getSubjectStart(), iMatch.getSubjectEnd(), iMatch.getEvalue(), iMatch.getScore(), iMatch.getIdentity())
                outFile.write(string)

    ## Convert a 'match' file (output from Matcher) into an 'abc' file (MCL input file)
    # Use coverage on query for arc value
    #
    # @param matchFileName string input match file name
    # @param outFileName string output abc file name
    # @param coverage float query coverage filter threshold
    # @note the first line of the input is always skipped; columns 1, 7 and 5 (1-based) of the
    #       other lines are written when column 5 is greater than or equal to the threshold
    #
    @staticmethod
    def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):
        with open(matchFileName) as inF:
            with open(outFileName, "w") as outF:
                inF.readline()
                for inLine in inF:
                    lFields = inLine.split("\t")
                    if float(lFields[4]) >= coverage:
                        outF.write("\t".join([lFields[0], lFields[6], lFields[4]]) + "\n")

    ## Adapt the path IDs as the input file is the concatenation of several 'Match' files, and remove the extra header lines.
    #
    ## replace old parser.tabnum2id
    #
    # @param fileName a string input file name
    # @param outputFileName a string output file name (optional, the input file is rewritten if omitted)
    # @note a new path id is given to each distinct (old id, query name, subject name) combination,
    #       numbered from 1 in order of first appearance
    #
    @staticmethod
    def generateMatchFileWithNewPathId(fileName, outputFileName=None):
        # the matches MUST be loaded before the output is opened: when no output name is given
        # the input file itself is rewritten, and opening it in "w" mode first would empty it
        lMatches = MatchUtils.getMatchListFromFile(fileName)
        if outputFileName is None:
            outputFileName = fileName

        dMatchKeyIdcount = {}
        with open(outputFileName, "w") as outFile:
            outFile.write(MatchUtils._MATCH_FILE_HEADER)
            for iMatch in lMatches:
                key_id = str(iMatch.getIdentifier()) + "-" + iMatch.getQueryName() + "-" + iMatch.getSubjectName()
                if key_id not in dMatchKeyIdcount:
                    dMatchKeyIdcount[key_id] = len(dMatchKeyIdcount) + 1
                iMatch.id = dMatchKeyIdcount[key_id]
                outFile.write(iMatch.toString() + "\n")
Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Record a region on multiple sequence using Path ID information
#
class MergedRange(object):

    ## Constructor
    #
    # @param lId list of Path ID (copied, so that merge() never modifies the list of the caller)
    # @param start the start coordinate
    # @param end the end coordinate
    #
    def __init__(self, lId = None, start = -1, end = -1):
        self._lId = list(lId) if lId else []
        self._start = start
        self._end = end

    ## Equal operator
    #
    # @param o a MergedRange instance
    # @return True if identifiers, start and end are all equal; an object lacking
    #         these attributes is never equal (no exception is raised)
    #
    def __eq__(self, o):
        try:
            return o._lId == self._lId and o._start == self._start and o._end == self._end
        except AttributeError:
            return NotImplemented

    ## Not-equal operator
    #
    # Needed under Python 2, where '!=' does not fall back on __eq__.
    #
    # @param o a MergedRange instance
    #
    def __ne__(self, o):
        isEqual = self.__eq__(o)
        if isEqual is NotImplemented:
            return isEqual
        return not isEqual

    ## Return a printable representation (debugging purpose)
    #
    def __repr__(self):
        return "MergedRange(lId=%r, start=%r, end=%r)" % (self._lId, self._start, self._end)

    ## Return True if the MergedRange instance overlaps with another MergedRange instance, False otherwise
    #
    # @param o a MergedRange instance
    # @return boolean False or True
    #
    def isOverlapping(self, o):
        # the other range contains this one
        if o._start <= self._start and o._end >= self._end:
            return True
        # one of the boundaries of the other range lies inside this one
        if self._start <= o._start <= self._end:
            return True
        if self._start <= o._end <= self._end:
            return True
        return False

    ## Merge coordinates and ID of two Merged Range
    #
    # The instance becomes the smallest range covering both, and gets the sorted
    # identifiers of both (the other instance is not modified).
    #
    # @param o a MergedRange instance
    #
    def merge(self, o):
        self._start = min(self._start, o._start)
        self._end = max(self._end, o._end)
        self._lId.extend(o._lId)
        self._lId.sort()

    ## Set a Merged Range instance using a Match instance
    #
    # @param iMatch instance Match instance (its id and the coordinates of its query range are used)
    #
    def setFromMatch(self, iMatch):
        self._lId = [iMatch.id]
        self._start = iMatch.range_query.start
        self._end = iMatch.range_query.end

    ## Get a Merged Range instance list using a Match instance list
    #
    # @param lIMatch list Match instance list
    # @return lMergedRange list MergedRange instance list (one per Match, same order)
    #
    @staticmethod
    def getMergedRangeListFromMatchList(lIMatch):
        lMergedRange = []
        for iMatch in lIMatch:
            iMergedRange = MergedRange()
            iMergedRange.setFromMatch(iMatch)
            lMergedRange.append(iMergedRange)
        return lMergedRange
00:00:00 1970 +0000 +++ b/commons/core/coord/Path.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,149 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity) with an identifier
#
class Path( Align ):

    ## Constructor
    #
    # @param id identifier
    # @param range_q: a Range instance for the query (a new empty Range if omitted)
    # @param range_s: a Range instance for the subject (a new empty Range if omitted)
    # @param e_value: E-value of the match
    # @param score: score of the match
    # @param identity: identity percentage of the match
    # @note the default ranges are created at each call: a Range() written in the
    #       signature would be built only once and shared by all the instances
    #
    def __init__( self, id=-1, range_q=None, range_s=None, e_value=0, score=0, identity=0 ):
        self.id = int( id )
        if range_q is None:
            range_q = Range()
        if range_s is None:
            range_s = Range()
        Align.__init__( self, range_q, range_s, e_value, score, identity )

    ## Equal operator
    #
    # @param o a Path instance
    # @return False if o is None or has another identifier, otherwise the result of the Align comparison
    #
    def __eq__(self, o):
        if o is None or self.id != o.id:
            return False
        return Align.__eq__(self, o)

    ## Set attributes from tuple
    #
    # @param tuple a tuple with (id,queryName,queryStart,queryEnd,subjectName,subjectStar,subjectEnd,E-value,score,identity)
    # @note data are loaded such that the query is always on the direct strand
    #
    def setFromTuple(self, tuple):
        self.id = int(tuple[0])
        # everything after the identifier is handled by the Align class
        Align.setFromTuple(self, tuple[1:])

    ## Reset
    #
    # The identifier is set back to -1, the other attributes are reset by the Align class.
    #
    def reset(self):
        self.id = -1
        Align.reset(self)

    ## Return the attributes as a formatted string
    #
    # @return the identifier, a tabulation, then the string given by the Align class
    #
    def toString(self):
        return "%i\t%s" % ( self.id, Align.toString(self) )

    ## Return the identifier of the Path instance
    #
    def getIdentifier( self ):
        return self.id

    ## Return a Set instance with the subject mapped on the query
    #
    # @note the Set is named after the subject and located on the query; its coordinates
    #       are the query ones, swapped when the subject is on the reverse strand
    #
    def getSubjectAsSetOfQuery(self):
        iSet = Set()
        iSet.id = self.id
        iSet.name = self.range_subject.seqname
        iSet.seqname = self.range_query.seqname
        if self.range_subject.isOnDirectStrand():
            iSet.start = self.range_query.start
            iSet.end = self.range_query.end
        else:
            iSet.start = self.range_query.end
            iSet.end = self.range_query.start
        return iSet

    #TODO: add tests !!!!
    #WARNING: subject always in direct strand !!!
    ## Return a Set instance with the query mapped on the subject
    #
    # @note the Set is named after the query and located on the subject; the subject
    #       coordinates are swapped when the subject is on the reverse strand
    #
    def getQuerySetOfSubject(self):
        iSet = Set()
        iSet.id = self.id
        iSet.name = self.range_query.seqname
        iSet.seqname = self.range_subject.seqname
        if self.range_subject.isOnDirectStrand():
            iSet.start = self.range_subject.start
            iSet.end = self.range_subject.end
        else:
            iSet.start = self.range_subject.end
            iSet.end = self.range_subject.start
        return iSet

    ## Return True if the instance can be merged with another Path instance, False otherwise
    #
    # Both instances must have different identifiers, the same query and subject names,
    # the same strands, and overlapping ranges on the query as well as on the subject.
    #
    # @param o a Path instance
    #
    def canMerge(self, o):
        if o.id == self.id:
            return False
        if o.range_query.seqname != self.range_query.seqname:
            return False
        if o.range_subject.seqname != self.range_subject.seqname:
            return False
        if o.range_query.isOnDirectStrand() != self.range_query.isOnDirectStrand():
            return False
        if o.range_subject.isOnDirectStrand() != self.range_subject.isOnDirectStrand():
            return False
        return o.range_query.isOverlapping(self.range_query) \
               and o.range_subject.isOverlapping(self.range_subject)

    ## Return an Align instance with the same attributes, except the identifier
    #
    def getAlignInstance(self):
        iAlign = Align()
        lAttributes = [ self.range_query.seqname,
                        self.range_query.start,
                        self.range_query.end,
                        self.range_subject.seqname,
                        self.range_subject.start,
                        self.range_subject.end,
                        self.e_value,
                        self.score,
                        self.identity ]
        iAlign.setFromTuple( lAttributes )
        return iAlign
http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Static methods for the manipulation of Path instances
#
class PathUtils ( object ):

    ## Change the identifier of each Path instance in the given list
    #
    # @param lPaths list of Path instances
    # @param newId new identifier
    #
    @staticmethod
    def changeIdInList(lPaths, newId):
        for iPath in lPaths:
            iPath.id = newId


    ## Return a list of Set instances containing the query range from a list of Path instances
    #
    # @param lPaths a list of Path instances
    #
    @staticmethod
    def getSetListFromQueries(lPaths):
        return [ iPath.getSubjectAsSetOfQuery() for iPath in lPaths ]

    #TODO: add tests !!!!
    ## Return a list of Set instances containing the subject range from a list of Path instances
    #
    # @param lPaths a list of Path instances
    #
    @staticmethod
    def getSetListFromSubjects(lPaths):
        return [ iPath.getQuerySetOfSubject() for iPath in lPaths ]


    ## Return a sorted list of Range instances containing the subjects from a list of Path instances
    #
    # @param lPaths a list of Path instances
    # @return list of Range instances, empty if the input list is empty
    # @note meaningful only if all Path instances have same identifier
    # @note the strand of the first subject selects the sort key: (min, max) on the direct strand, (max, min) otherwise
    #
    @staticmethod
    def getRangeListFromSubjects( lPaths ):
        lRanges = [ iPath.range_subject for iPath in lPaths ]
        if not lRanges:
            return []
        if lRanges[0].isOnDirectStrand():
            return sorted( lRanges, key=lambda iRange: ( iRange.getMin(), iRange.getMax() ) )
        return sorted( lRanges, key=lambda iRange: ( iRange.getMax(), iRange.getMin() ) )


    ## Return a tuple with min and max of query coordinates from Path instances in the given list
    #
    # @param lPaths a list of Path instances
    # @return (min, max), or (-1, -1) if the list is empty
    #
    @staticmethod
    def getQueryMinMaxFromPathList(lPaths):
        qmin = -1
        qmax = -1
        for iPath in lPaths:
            # -1 means 'not set yet'
            if qmin == -1:
                qmin = iPath.range_query.start
            qmin = min(qmin, iPath.range_query.getMin())
            qmax = max(qmax, iPath.range_query.getMax())
        return (qmin, qmax)


    ## Return a tuple with min and max of subject coordinates from Path instances in the given list
    #
    # @param lPaths lists of Path instances
    # @return (min, max), or (-1, -1) if the list is empty
    #
    @staticmethod
    def getSubjectMinMaxFromPathList(lPaths):
        smin = -1
        smax = -1
        for iPath in lPaths:
            # -1 means 'not set yet'
            if smin == -1:
                smin = iPath.range_subject.start
            smin = min(smin, iPath.range_subject.getMin())
            smax = max(smax, iPath.range_subject.getMax())
        return (smin, smax)


    ## Return True if the query range of any Path instance from the first list overlaps with the query range of any Path instance from the second list
    #
    # @param lPaths1: list of Path instances
    # @param lPaths2: list of Path instances
    # @return boolean
    # @note every Path of the first list is compared (the previous implementation never
    #       rewound its index on the second list, so only the first Path was really checked)
    #
    @staticmethod
    def areQueriesOverlappingBetweenPathLists( lPaths1, lPaths2 ):
        lSortedPaths2 = sorted( lPaths2, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax() ) )
        for iPath1 in lPaths1:
            maxQuery1 = iPath1.getQueryMax()
            for iPath2 in lSortedPaths2:
                # the second list is sorted on the query min: nothing further can overlap
                if iPath2.getQueryMin() > maxQuery1:
                    break
                if iPath1.range_query.isOverlapping( iPath2.range_query ):
                    return True
        return False


    ## Show Path instances contained in the given list
    #
    # @param lPaths a list of Path instances
    #
    @staticmethod
    def showList(lPaths):
        for iPath in lPaths:
            iPath.show()


    ## Write Path instances contained in the given list
    #
    # @param lPaths a list of Path instances
    # @param fileName name of the file to write the Path instances
    # @param mode the open mode of the file "w" or "a"
    #
    @staticmethod
    def writeListInFile(lPaths, fileName, mode="w"):
        AlignUtils.writeListInFile(lPaths, fileName, mode)


    ## Return new list of Path instances with no duplicate
    #
    # @param lPaths a list of Path instances
    # @param useOnlyCoord boolean if True, check only coordinates and sequence names
    # @return lUniqPaths a path instances list
    # @note the input list itself is returned when it has less than two elements
    # @note each Path is compared only with the last one kept, after sorting on (query min, query max, identifier)
    #
    @staticmethod
    def getPathListWithoutDuplicates(lPaths, useOnlyCoord = False):
        if len(lPaths) < 2:
            return lPaths

        def getCoordKey( iPath ):
            return ( iPath.range_query.start, iPath.range_query.end, iPath.range_query.seqname,
                     iPath.range_subject.start, iPath.range_subject.end, iPath.range_subject.seqname )

        lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
        lUniqPaths = [ lSortedPaths[0] ]
        for iPath in lSortedPaths[1:]:
            if useOnlyCoord:
                isDifferent = getCoordKey( iPath ) != getCoordKey( lUniqPaths[-1] )
            else:
                isDifferent = iPath != lUniqPaths[-1]
            if isDifferent:
                lUniqPaths.append( iPath )
        return lUniqPaths


    ## Return new list of Path instances with no duplicate on the query (start, end and name)
    #
    # @param lPaths a list of Path instances
    # @return lUniqPaths a path instances list
    # @note the input list itself is returned when it has less than two elements
    #
    @staticmethod
    def getPathListWithoutDuplicatesOnQueryCoord(lPaths):
        if len(lPaths) < 2:
            return lPaths

        def getQueryKey( iPath ):
            return ( iPath.range_query.start, iPath.range_query.end, iPath.range_query.seqname )

        lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
        lUniqPaths = [ lSortedPaths[0] ]
        for iPath in lSortedPaths[1:]:
            if getQueryKey( iPath ) != getQueryKey( lUniqPaths[-1] ):
                lUniqPaths.append( iPath )
        return lUniqPaths


    ## Split a Path list in several Path lists according to the identifier
    #
    # @param lPaths a list of Path instances
    # @return a dictionary which keys are identifiers and values Path lists
    #
    @staticmethod
    def getDictOfListsWithIdAsKey( lPaths ):
        dId2PathList = {}
        for iPath in lPaths:
            dId2PathList.setdefault( iPath.id, [] ).append( iPath )
        return dId2PathList


    ## Split a Path file in several Path lists according to the identifier
    #
    # @param pathFile name of the input Path file
    # @return a dictionary which keys are identifiers and values Path lists
    #
    @staticmethod
    def getDictOfListsWithIdAsKeyFromFile( pathFile ):
        dId2PathList = {}
        # 'with' guarantees the handle is closed even if a line cannot be parsed
        with open( pathFile, "r" ) as pathFileHandler:
            for line in pathFileHandler:
                iPath = Path()
                iPath.setFromString( line )
                dId2PathList.setdefault( iPath.id, [] ).append( iPath )
        return dId2PathList


    ## Return a list of Path list(s) obtained while splitting a list of connected Path instances according to another based on query coordinates
    #
    # @param lToKeep: a list of Path instances to keep (reference)
    # @param lToUnjoin: a list of Path instances to unjoin
    # @return: list of Path list(s) (can be empty if one of the input lists is empty)
    # @warning: all the path instances in a given list MUST be connected (i.e. same identifier)
    # @warning: all the path instances in a given list MUST NOT overlap neither within each other nor with the Path instances of the other list
    #
    @staticmethod
    def getPathListUnjoinedBasedOnQuery( lToKeep, lToUnjoin ):
        lSortedToKeep = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToKeep )
        lSortedToUnjoin = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToUnjoin )
        if lToUnjoin == []:
            return []
        if lToKeep == []:
            return [ lToUnjoin ]

        lLists = []
        k = 0
        while k < len(lSortedToKeep):
            # Paths to unjoin located entirely before the current Path to keep
            j1 = 0
            while j1 < len(lSortedToUnjoin) and lSortedToKeep[k].range_query.getMin() > lSortedToUnjoin[j1].range_query.getMax():
                j1 += 1
            if j1 == len(lSortedToUnjoin):
                break
            if j1 != 0:
                lLists.append( lSortedToUnjoin[:j1] )
                del lSortedToUnjoin[:j1]
                j1 = 0
            if k+1 == len(lSortedToKeep):
                break
            # Paths to unjoin located between the current Path to keep and the next one
            j2 = j1
            if j2 < len(lSortedToUnjoin) and lSortedToKeep[k+1].range_query.getMin() > lSortedToUnjoin[j2].range_query.getMax():
                while j2 < len(lSortedToUnjoin) and lSortedToKeep[k+1].range_query.getMin() > lSortedToUnjoin[j2].range_query.getMax():
                    j2 += 1
                lLists.append( lSortedToUnjoin[j1:j2] )
                del lSortedToUnjoin[j1:j2]
            k += 1

        # what remains forms the last group
        if lLists != [] or k == 0:
            lLists.append( lSortedToUnjoin )
        return lLists


    ## Return the identity of the Path list, the identity of each instance being weighted by the length of each query range
    # All Paths should have the same query and subject.
    # The Paths are merged using query coordinates only.
    #
    # @param lPaths list of Path instances
    # @param checkSubjects boolean if True, all the Paths must also have the same subject
    # @return float weighted identity
    # @throw Exception if several queries (or subjects, when checked) are found, or if the result is negative
    # @throw RepetDataException if the result is above 100
    # @note each error message is written on stderr and carried by the exception
    #
    @staticmethod
    def getIdentityFromPathList( lPaths, checkSubjects=True ):
        if len( PathUtils.getListOfDistinctQueryNames( lPaths ) ) > 1:
            msg = "ERROR: try to compute identity from Paths with different queries"
            sys.stderr.write( "%s\n" % msg )
            sys.stderr.flush()
            raise Exception( msg )
        if checkSubjects and len( PathUtils.getListOfDistinctSubjectNames( lPaths ) ) > 1:
            msg = "ERROR: try to compute identity from Paths with different subjects"
            sys.stderr.write( "%s\n" % msg )
            sys.stderr.flush()
            raise Exception( msg )
        identity = 0
        lMergedPaths = PathUtils.mergePathsInListUsingQueryCoordsOnly( lPaths )
        lQuerySets = PathUtils.getSetListFromQueries( lMergedPaths )
        lMergedQuerySets = SetUtils.mergeSetsInList( lQuerySets )
        totalLengthOnQry = SetUtils.getCumulLength( lMergedQuerySets )
        for iPath in lMergedPaths:
            identity += iPath.identity * iPath.getLengthOnQuery()
        weightedIdentity = identity / float(totalLengthOnQry)
        if weightedIdentity < 0:
            msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
            sys.stderr.write("%s\n" % msg)
            sys.stderr.flush()
            raise Exception( msg )
        elif weightedIdentity > 100:
            msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
            sys.stderr.write("%s\n" % msg)
            sys.stderr.flush()
            raise RepetDataException(msg)
        return weightedIdentity


    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, and finally their initial order.
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQuery(lPaths):
        return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax() ) )


    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then their identifier, and finally their initial order.
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths):
        return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax(), iPath.getIdentifier() ) )


    ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then the min of the subject, then the max of the subject and finally their initial order.
    #
    # @param lPaths list of Path instances
    #
    @staticmethod
    def getPathListSortedByIncreasingMinQueryThenMaxQueryThenMinSubjectThenMaxSubject(lPaths):
        return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax(), iPath.getSubjectMin(), iPath.getSubjectMax()))


    ## Return a list of the distinct identifiers
    #
    # @param lPaths list of Path instances
    # @note the order of the returned list is not defined
    #
    @staticmethod
    def getListOfDistinctIdentifiers( lPaths ):
        return list( set( iPath.id for iPath in lPaths ) )


    ## Return a list of the distinct query names present in the collection
    #
    # @param lPaths list of Path instances
    # @note the order of the returned list is not defined
    #
    @staticmethod
    def getListOfDistinctQueryNames( lPaths ):
        return list( set( iPath.range_query.seqname for iPath in lPaths ) )


    ## Return a list of the distinct subject names present in the collection
    #
    # @param lPaths list of Path instances
    # @note the order of the returned list is not defined
    #
    @staticmethod
    def getListOfDistinctSubjectNames( lPaths ):
        return list( set( iPath.range_subject.seqname for iPath in lPaths ) )


    ## Return a list of lists containing query coordinates of the connections sorted in increasing order.
    #
    # @param lConnectedPaths: list of Path instances having the same identifier
    # @param minLength: threshold below which connections are not reported (default= 0 bp)
    # @note: return only connections longer than threshold
    # @note: if coordinate on query ends at 100, return 101
    # @warning: Path instances MUST be sorted in increasing order according to query coordinates
    # @warning: Path instances MUST be on direct query strand (and maybe on reverse subject strand)
    #
    @staticmethod
    def getListOfJoinCoordinatesOnQuery(lConnectedPaths, minLength=0):
        lJoinCoordinates = []
        # walk through the pairs of consecutive Paths
        for iPrevPath, iNextPath in zip( lConnectedPaths, lConnectedPaths[1:] ):
            startJoin = iPrevPath.range_query.end
            endJoin = iNextPath.range_query.start
            # NOTE(review): the tested value is two units above the number of bases strictly
            # between both Paths; kept as is, to be confirmed against the callers
            if endJoin - startJoin + 1 > minLength:
                lJoinCoordinates.append( [ startJoin + 1, endJoin - 1 ] )
        return lJoinCoordinates


    ## Return the length on the query of all Path instance in the given list
    #
    # @param lPaths list of Path instances
    # @note overlapping ranges are not summed but truncated.
    #
    @staticmethod
    def getLengthOnQueryFromPathList( lPaths ):
        lSets = PathUtils.getSetListFromQueries( lPaths )
        lMergedSets = SetUtils.mergeSetsInList( lSets )
        return SetUtils.getCumulLength( lMergedSets )


    ## Convert a Path file into an Align file
    #
    # @param pathFile: name of the input Path file
    # @param alignFile: name of the output Align file
    #
    @staticmethod
    def convertPathFileIntoAlignFile(pathFile, alignFile):
        iPath = Path()   # the same instance is set again from each line
        with open( pathFile, "r" ) as pathFileHandler:
            with open( alignFile, "w" ) as alignFileHandler:
                for line in pathFileHandler:
                    iPath.setFromString( line )
                    iAlign = iPath.getAlignInstance()
                    iAlign.write( alignFileHandler )

    #TODO: duplicated method => to rename with the name of the next method (which is called) ?
+ ## Convert a Path File into a Map file with query coordinates only + # + # @param pathFile: name of the input Path file + # @param mapFile: name of the output Map file + # + def convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile ): + pathFileHandler = open( pathFile, "r" ) + mapFileHandler = open( mapFile, "w" ) + p = Path() + while True: + line = pathFileHandler.readline() + if line == "": + break + p.reset() + p.setFromTuple( line.split("\t") ) + p.writeSubjectAsMapOfQuery( mapFileHandler ) + pathFileHandler.close() + mapFileHandler.close() + + convertPathFileIntoMapFileWithQueryCoordsOnly = staticmethod( convertPathFileIntoMapFileWithQueryCoordsOnly ) + + + ## for each line of a given Path file, write the coordinates of the subject on the query as one line in a Map file + # + # @param pathFile: name of the input Path file + # @param mapFile: name of the output Map file + # + def convertPathFileIntoMapFileWithSubjectsOnQueries( pathFile, mapFile ): + PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile ) + convertPathFileIntoMapFileWithSubjectsOnQueries = staticmethod( convertPathFileIntoMapFileWithSubjectsOnQueries ) + + + ## Merge matches on queries + # + # @param inFile: name of the input Path file + # @param outFile: name of the output Path file + # + def mergeMatchesOnQueries(inFile, outFile): + mapFile = "%s.map" % ( inFile ) + PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( inFile, mapFile ) + cmd = "mapOp" + cmd += " -q %s" % ( mapFile ) + cmd += " -m" + cmd += " 2>&1 > /dev/null" + exitStatus = os.system( cmd ) + if exitStatus != 0: + print "ERROR: mapOp returned %i" % ( exitStatus ) + sys.exit(1) + os.remove( mapFile ) + mergeFile = "%s.merge" % ( mapFile ) + mergeFileHandler = open( mergeFile, "r" ) + outFileHandler = open( outFile, "w" ) + m = Map() + while True: + line = mergeFileHandler.readline() + if line == "": + break + m.reset() + m.setFromString( line, "\t" ) + m.writeAsQueryOfPath( outFileHandler 
) + mergeFileHandler.close() + os.remove( mergeFile ) + outFileHandler.close() + + mergeMatchesOnQueries = staticmethod( mergeMatchesOnQueries ) + + + ## Filter chains of Path(s) which length is below a given threshold + # + # @param lPaths: list of Path instances + # @param minLengthChain: minimum length of a chain to be kept + # @note: a chain may contain a single Path instance + # @return: a list of Path instances + # + def filterPathListOnChainLength( lPaths, minLengthChain ): + lFilteredPaths = [] + dPathnum2Paths = PathUtils.getDictOfListsWithIdAsKey( lPaths ) + for pathnum in dPathnum2Paths.keys(): + length = PathUtils.getLengthOnQueryFromPathList( dPathnum2Paths[ pathnum ] ) + if length >= minLengthChain: + lFilteredPaths += dPathnum2Paths[ pathnum ] + return lFilteredPaths + + filterPathListOnChainLength = staticmethod( filterPathListOnChainLength ) + + + ## Return a Path list from a Path file + # + # @param pathFile string name of a Path file + # @return a list of Path instances + # + def getPathListFromFile( pathFile ): + lPaths = [] + pathFileHandler = open( pathFile, "r" ) + while True: + line = pathFileHandler.readline() + if line == "": + break + iPath = Path() + iPath.setFromString( line ) + lPaths.append( iPath ) + pathFileHandler.close() + return lPaths + + getPathListFromFile = staticmethod( getPathListFromFile ) + + + ## Convert a chain into a 'pathrange' + # + # @param lPaths a list of Path instances with the same identifier + # @note: the min and max of each Path is used + # + def convertPathListToPathrange( lPaths ): + if len(lPaths) == 0: + return + if len(lPaths) == 1: + return lPaths[0] + iPathrange = copy.deepcopy( lPaths[0] ) + iPathrange.identity = lPaths[0].identity * lPaths[0].getLengthOnQuery() + cumulQueryLength = iPathrange.getLengthOnQuery() + for iPath in lPaths[1:]: + if iPath.id != iPathrange.id: + msg = "ERROR: two Path instances in the chain have different identifiers" + sys.stderr.write( "%s\n" % ( msg ) ) + sys.exit(1) + if 
iPathrange.range_subject.isOnDirectStrand() != iPath.range_subject.isOnDirectStrand(): + msg = "ERROR: two Path instances in the chain are on different strands" + sys.stderr.write( "%s\n" % ( msg ) ) + sys.exit(1) + iPathrange.range_query.start = min( iPathrange.range_query.start, iPath.range_query.start ) + iPathrange.range_query.end = max( iPathrange.range_query.end, iPath.range_query.end ) + if iPathrange.range_subject.isOnDirectStrand(): + iPathrange.range_subject.start = min( iPathrange.range_subject.start, iPath.range_subject.start ) + iPathrange.range_subject.end = max( iPathrange.range_subject.end, iPath.range_subject.end ) + else: + iPathrange.range_subject.start = max( iPathrange.range_subject.start, iPath.range_subject.start ) + iPathrange.range_subject.end = min( iPathrange.range_subject.end, iPath.range_subject.end ) + iPathrange.e_value = min( iPathrange.e_value, iPath.e_value ) + iPathrange.score += iPath.score + iPathrange.identity += iPath.identity * iPath.getLengthOnQuery() + cumulQueryLength += iPath.getLengthOnQuery() + iPathrange.identity = iPathrange.identity / float(cumulQueryLength) + return iPathrange + + convertPathListToPathrange = staticmethod( convertPathListToPathrange ) + + + ## Convert a Path file into an Align file via 'pathrange' + # + # @param pathFile: name of the input Path file + # @param alignFile: name of the output Align file + # @param verbose integer verbosity level + # @note: the min and max of each Path is used + # + def convertPathFileIntoAlignFileViaPathrange( pathFile, alignFile, verbose=0 ): + lPaths = PathUtils.getPathListFromFile( pathFile ) + dId2PathList = PathUtils.getDictOfListsWithIdAsKey( lPaths ) + lIds = dId2PathList.keys() + lIds.sort() + if verbose > 0: + msg = "number of chains: %i" % ( len(lIds) ) + sys.stdout.write( "%s\n" % ( msg ) ) + sys.stdout.flush() + alignFileHandler = open( alignFile, "w" ) + for identifier in lIds: + iPath = PathUtils.convertPathListToPathrange( dId2PathList[ identifier ] ) + 
iAlign = iPath.getAlignInstance() + iAlign.write( alignFileHandler ) + alignFileHandler.close() + + convertPathFileIntoAlignFileViaPathrange = staticmethod( convertPathFileIntoAlignFileViaPathrange ) + + + ## Split a list of Path instances according to the name of the query + # + # @param lInPaths list of align instances + # @return lOutPathLists list of align instances lists + # + def splitPathListByQueryName( lInPaths ): + lInSortedPaths = sorted( lInPaths, key=lambda o: o.range_query.seqname ) + lOutPathLists = [] + if len(lInSortedPaths) != 0 : + lPathsForCurrentQuery = [] + previousQuery = lInSortedPaths[0].range_query.seqname + for iPath in lInSortedPaths : + currentQuery = iPath.range_query.seqname + if previousQuery != currentQuery : + lOutPathLists.append( lPathsForCurrentQuery ) + previousQuery = currentQuery + lPathsForCurrentQuery = [] + lPathsForCurrentQuery.append( iPath ) + + lOutPathLists.append(lPathsForCurrentQuery) + + return lOutPathLists + + splitPathListByQueryName = staticmethod( splitPathListByQueryName ) + + + ## Create an Path file from each list of Path instances in the input list + # + # @param lPathList list of lists with Path instances + # @param pattern string + # @param dirName string + # + def createPathFiles( lPathList, pattern, dirName="" ): + nbFiles = len(lPathList) + countFile = 1 + if dirName != "" : + if dirName[-1] != "/": + dirName = dirName + '/' + os.mkdir( dirName ) + + for lPath in lPathList: + fileName = dirName + pattern + "_%s.path" % ( str(countFile).zfill( len(str(nbFiles)) ) ) + PathUtils.writeListInFile( lPath, fileName ) + countFile += 1 + + createPathFiles = staticmethod( createPathFiles ) + + + ## Return a list of Path instances sorted in increasing order according to the min, then the inverse of the query length, and finally their initial order + # + # @param lPaths: list of Path instances + # + def getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ): + return sorted( lPaths, key=lambda iPath: ( 
iPath.getQueryMin(), 1 / float(iPath.getLengthOnQuery()) ) ) + + getPathListSortedByIncreasingQueryMinThenInvQueryLength = staticmethod( getPathListSortedByIncreasingQueryMinThenInvQueryLength ) + + + ## Merge all overlapping Path instances in a list without considering the identifiers + # Start by sorting the Path instances by their increasing min coordinate + # + # @return: a new list with the merged Path instances + # + def mergePathsInList( lPaths ): + lMergedPaths = [] + if len(lPaths)==0: + return lMergedPaths + + lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ) + + prev_count = 0 + for iPath in lSortedPaths[0:]: + if prev_count != len(lSortedPaths): + for i in lSortedPaths[ prev_count + 1: ]: + if iPath.isOverlapping( i ): + iPath.merge( i ) + isAlreadyInList = False + for newPath in lMergedPaths: + if newPath.isOverlapping( iPath ): + isAlreadyInList = True + newPath.merge( iPath ) + lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath + if not isAlreadyInList: + lMergedPaths.append( iPath ) + prev_count += 1 + return lMergedPaths + + mergePathsInList = staticmethod( mergePathsInList ) + + + ## Merge all overlapping Path instances in a list without considering if subjects are overlapping. + # Start by sorting the Path instances by their increasing min coordinate. 
+ # + # @return: a new list with the merged Path instances + # + def mergePathsInListUsingQueryCoordsOnly( lPaths ): + lMergedPaths = [] + if len(lPaths)==0: + return lMergedPaths + + lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ) + + prev_count = 0 + for iPath in lSortedPaths[0:]: + if prev_count != len(lSortedPaths): + for i in lSortedPaths[ prev_count + 1: ]: + if iPath.isQueryOverlapping( i ): + iPath.merge( i ) + isAlreadyInList = False + for newPath in lMergedPaths: + if newPath.isQueryOverlapping( iPath ): + isAlreadyInList = True + newPath.merge( iPath ) + lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath + if not isAlreadyInList: + lMergedPaths.append( iPath ) + prev_count += 1 + return lMergedPaths + + mergePathsInListUsingQueryCoordsOnly = staticmethod( mergePathsInListUsingQueryCoordsOnly ) + + + ## Convert a Path file into a GFF file + # + # @param pathFile: name of the input Path file + # @param gffFile: name of the output GFF file + # @param source: source to write in the GFF file (column 2) + # + # @note the 'path' query is supposed to correspond to the 'gff' first column + # + def convertPathFileIntoGffFile( pathFile, gffFile, source="REPET", verbose=0 ): + dId2PathList = PathUtils.getDictOfListsWithIdAsKeyFromFile( pathFile ) + if verbose > 0: + msg = "number of chains: %i" % ( len(dId2PathList.keys()) ) + sys.stdout.write( "%s\n" % msg ) + sys.stdout.flush() + gffFileHandler = open( gffFile, "w" ) + for id in dId2PathList.keys(): + if len( dId2PathList[ id ] ) == 1: + iPath = dId2PathList[ id ][0] + string = iPath.toStringAsGff( ID="%i" % iPath.getIdentifier(), + source=source ) + gffFileHandler.write( "%s\n" % string ) + else: + iPathrange = PathUtils.convertPathListToPathrange( dId2PathList[ id ] ) + string = iPathrange.toStringAsGff( ID="ms%i" % iPathrange.getIdentifier(), + source=source ) + gffFileHandler.write( "%s\n" % string ) + count = 0 + for iPath in dId2PathList[ id ]: + count += 1 + 
string = iPath.toStringAsGff( type="match_part", + ID="mp%i-%i" % ( iPath.getIdentifier(), count ), + Parent="ms%i" % iPathrange.getIdentifier(), + source=source ) + gffFileHandler.write( "%s\n" % string ) + gffFileHandler.close() + + convertPathFileIntoGffFile = staticmethod( convertPathFileIntoGffFile ) + + + ## Convert a Path file into a Set file + # replace old parser.pathrange2set + # @param pathFile: name of the input Path file + # @param setFile: name of the output Set file + # + def convertPathFileIntoSetFile( pathFile, setFile ): + pathFileHandler = open( pathFile, "r" ) + setFileHandler = open( setFile, "w" ) + iPath = Path() + while True: + line = pathFileHandler.readline() + if line == "": + break + iPath.setFromString( line ) + iSet = iPath.getSubjectAsSetOfQuery() + iSet.write( setFileHandler ) + pathFileHandler.close() + setFileHandler.close() + + convertPathFileIntoSetFile = staticmethod( convertPathFileIntoSetFile ) + + ## Write Path File without duplicated Path (same query, same subject and same coordinate) + # + # @param inputFile: name of the input Path file + # @param outputFile: name of the output Path file + # + def removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName(inputFile, outputFile): + f = open(inputFile, "r") + line = f.readline() + previousQuery = "" + previousSubject = "" + lPaths = [] + while line: + iPath = Path() + iPath.setFromString(line) + query = iPath.getQueryName() + subject = iPath.getSubjectName() + if (query != previousQuery or subject != previousSubject) and lPaths != []: + lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths) + PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a") + lPaths = [] + lPaths.append(iPath) + previousQuery = query + previousSubject = subject + line = f.readline() + lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths) + PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a") + f.close() + 
removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName = staticmethod(removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName) + + \ No newline at end of file diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/Range.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Range.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,361 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +## Record a region on a given sequence +# +class Range( object ): + + ## Constructor + # + # @param seqname the name of the sequence + # @param start the start coordinate + # @param end the end coordinate + # + def __init__(self, seqname="", start=-1, end=-1): + self.seqname = seqname + self.start = int(start) + self.end = int(end) + + ## Equal operator + # + # @param o a Range instance + # + def __eq__(self, o): + if self.seqname == o.seqname and self.start == o.start and self.end == o.end: + return True + return False + + ## Unequal operator + # + # @param o a Range instance + # + def __ne__(self, o): + return not self.__eq__(o) + + ## Convert the object into a string + # + # @note used in 'print myObject' + # + def __str__( self ): + return self.toString() + + ## Convert the object into a string + # + # @note used in 'repr(myObject)' for debugging + # + def __repr__( self ): + return self.toString().replace("\t",";") + + def setStart(self, start): + self.start = start + + def setEnd(self, end): + self.end = end + + def setSeqName(self, seqName): + self.seqname = seqName + + ## Reset + # + def reset(self): + self.seqname = "" + self.start = -1 + self.end = -1 + + ## Return the attributes as a formatted string + # + def toString(self): + string = "%s" % (self.seqname) + string += "\t%d" % (self.start) + string += "\t%d" % (self.end) + return string + + ## Show the attributes + # + def show(self): + print self.toString() + + ## Return seqname + # + def getSeqname(self): + return self.seqname + + ## Return the start coordinate + # + def getStart(self): + return self.start + + ## Return the end coordinate + # + def getEnd(self): + return self.end + + ## Return the lowest value between start and end coordinates + # + def getMin(self): + return min(self.start, self.end) + + ## Return the greatest value between start and end attributes + # + def getMax(self): + return max(self.start, self.end) + + ## Return True if the instance is on the direct strand, False 
otherwise + # + def isOnDirectStrand(self): + if self.start <= self.end: + return True + else: + return False + + ## Return True if the instance is on the reverse strand, False otherwise + # + def isOnReverseStrand(self): + return not self.isOnDirectStrand() + + ## Return '+' if the instance is on the direct strand, '-' otherwise + # + def getStrand(self): + if self.isOnDirectStrand(): + return '+' + else: + return '-' + + ## Exchange start and end coordinates + # + def reverse(self): + tmp = self.start + self.start = self.end + self.end = tmp + + ## Return the length of the instance + # + # @warning old name is 'length' + # + def getLength(self): + return int(abs(self.start-self.end))+1 + + ## Return True if the instance is empty, False otherwise + # + def isEmpty(self): + if self.start==self.end and (self.start==0 or self.start==-1): + return True + return False + + ## Set attributes from tuple + # + # @param tuple a tuple with (name,start,end) + # + def setFromTuple(self, tuple): + self.seqname = tuple[0] + self.start = int(tuple[1]) + self.end = int(tuple[2]) + + ## Set attributes from string + # + # @param string a string formatted like namestartend + # @param sep field separator + # + def setFromString(self, string, sep="\t"): + if string[-1] == "\n": + string = string[:-1] + self.setFromTuple( string.split(sep) ) + + ## Merge the instance with another Range instance + # + # @param o a Range instance + # + def merge(self, o): + if self.seqname != o.seqname: + return + if self.isOnDirectStrand(): + self.start = min(self.getMin(), o.getMin()) + self.end = max(self.getMax(), o.getMax()) + else: + self.start = max(self.getMax(), o.getMax()) + self.end = min(self.getMin(), o.getMin()) + + ## Return True if the instance overlaps with another Range instance, False otherwise + # + # @param o a Range instance + # + def isOverlapping(self, o): + if o.seqname != self.seqname: + return False + smin = self.getMin() + smax = self.getMax() + omin = o.getMin() + omax = 
o.getMax() + if omin <= smin and omax >= smax: + return True + if omin >= smin and omin <= smax or omax >= smin and omax <= smax: + return True + return False + + + ## Return the length of the overlap between the instance and another Range, 0 if no overlap + # + # @param o a Range instance + # + def getOverlapLength( self, o ): + if self.isOverlapping( o ): + if self.isIncludedIn( o ): + return self.getLength() + elif o.isIncludedIn( self ): + return o.getLength() + elif o.getMin() <= self.getMax() and o.getMin() >= self.getMin(): + return self.getMax() - o.getMin() + 1 + elif o.getMax() <= self.getMax() and o.getMax() >= self.getMin(): + return o.getMax() - self.getMin() + 1 + return 0 + + + ## Return True if the instance is included within another Range, False otherwise + # + # @param o a Range instance + # + # @note the min (respectively max) coordinates can be equal + # + def isIncludedIn( self, o ): + if o.seqname != self.seqname: + return False + if self.getMin() >= o.getMin() and self.getMax() <= o.getMax(): + return True + else: + return False + + + ## Return the distance between the start of the instance and the start of another Range instance + # + # @param o a Range instance + # + def getDistance(self, o): + if self.isOnDirectStrand() == o.isOnDirectStrand(): + if self.isOverlapping(o): + return 0 + elif self.isOnDirectStrand(): + if self.start > o.start: + return self.start - o.end + else: + return o.start - self.end + else: + if self.start > o.start: + return self.end - o.start + else: + return o.end - self.start + return -1 + + ## Remove in the instance the region overlapping with another Range instance + # + # @param o a Range instance + # + def diff(self, o): + new_range = Range(self.seqname) + if not self.isOverlapping(o) or self.seqname != o.seqname: + return new_range + + istart = min(self.start, self.end) + iend = max(self.start, self.end) + jstart = min(o.start, o.end) + jend = max(o.start, o.end) + if istart < jstart: + if iend <= jend: + if 
self.isOnDirectStrand(): + self.start = istart + self.end = jstart - 1 + else: + self.start = jstart - 1 + self.end = istart + else: + if self.isOnDirectStrand(): + self.start = istart + self.end = jstart - 1 + new_range.start = jend + 1 + new_range.end = iend + else: + self.start = jstart - 1; + self.end = istart; + new_range.start = iend + new_range.end = jend + 1 + else: #istart>=jstart + if iend <= jend: + self.start = 0 + self.end = 0 + else: + if self.isOnDirectStrand(): + self.start = jend + 1 + self.end = iend + else: + self.start = iend + self.end = jend + 1 + return new_range + + ## Find the bin that contains the instance and compute its index + # + # @note Required for coordinate indexing via a hierarchical bin system + # + def findIdx(self): + min_lvl = 3 + max_lvl = 6 + for bin_lvl in xrange(min_lvl, max_lvl): + if getBin(self.start, bin_lvl) == getBin(self.end, bin_lvl): + return getIdx(self.start, bin_lvl) + return getIdx(self.start, max_lvl) + + ## Get a bin for fast database access + # + # @return bin number (float) + # + def getBin(self): + for i in xrange(3, 8): + bin_lvl = pow(10, i) + if int(self.start/bin_lvl) == int(self.end/bin_lvl): + return float(bin_lvl+(int(self.start/bin_lvl)/1e10)) + bin_lvl = pow(10, 8) + return float(bin_lvl+(int(self.start/bin_lvl)/1e10)) + + +# Functions + +# Get the bin number of a coordinate according to the bin level. Required for coordinate indexing with hierarchical bin system +# +def getBin(val, bin_lvl): + bin_size = pow(10, bin_lvl) + return long(val / bin_size) + +# Get an index from a coordinate according to the bin level. 
Required for coordinate indexing with hierarchical bin system +# +def getIdx(val, bin_lvl): + min_lvl = 3 + max_lvl = 6 + if bin_lvl >= max_lvl: + return long((bin_lvl-min_lvl+1)*pow(10,max_lvl)) + return long(((bin_lvl-min_lvl+1)*pow(10,max_lvl))+getBin(val,bin_lvl)) diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/Set.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Set.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,125 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
+# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.coord.Map import Map + + +## Record a named region on a given sequence with an identifier +# +class Set( Map ): + + ## Constructor + # + # @param id identifier + # @param name the name of the region + # @param seqname the name of the sequence + # @param start the start coordinate + # @param end the end coordinate + # + def __init__(self, id=-1, name="", seqname="", start=-1, end=-1): + Map.__init__( self, name, seqname, start, end ) + self.id = id + + ## Equal operator + # + def __eq__(self, o): + if self.id != o.id: + return False + else: + return Map.__eq__(self, o) + + def getId(self): + return self.id + + ## Reset + # + def reset(self): + self.setFromTuple([-1, "", "", -1, -1 ]) + + ## Set attributes from tuple + # + # @param tuple: a tuple with (id, name, seqname, start, end) + # + def setFromTuple(self, tuple): + self.id = int(tuple[0]) + Map.setFromTuple(self, tuple[1:]) + + ## Return the attributes as a formatted string + # + def toString(self): + string = "%i" % (self.id) + string += "\t%s" % (Map.toString(self)) + return string + + ## Merge the instance with another Set instance + # + # @param o a Set instance + # + def merge(self, o): + if self.seqname == o.seqname: + Map.merge(self, o) + self.id = min(self.id, o.id) + + ## Return a Map instance with the attributes + # + def getMap(self): + return Map(self.name, self.seqname, self.start, self.end) + + ## Remove in the instance the region overlapping with another Set instance + # + # @param o a Set instance + # + def diff(self, o): + iMap = Map.diff(self, o.getMap()) + new = Set() + if not iMap.isEmpty(): + new.id = self.id + new.name = self.name + new.seqname = self.seqname + new.start = iMap.start + new.end = iMap.end + return new + + ## Return a Map instance with the identifier in the name + # + def set2map(self): + return 
Map(self.name+"::"+str(self.id),self.seqname,self.start,self.end) + + + def getMapInstance( self ): + iMap = Map() + lAttributes = [] + lAttributes.append( self.name ) + lAttributes.append( self.seqname ) + lAttributes.append( self.start ) + lAttributes.append( self.end ) + iMap.setFromTuple( lAttributes ) + return iMap diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/SetUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/SetUtils.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,553 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. 
+# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.coord.Set import Set + +## Static methods for the manipulation of Path instances +# +class SetUtils( object ): + + ## Change the identifier of each Set instance in the given list + # + # @param lSets list of Set instances + # @param newId new identifier + # + def changeIdInList(lSets, newId): + for iSet in lSets: + iSet.id = newId + + changeIdInList = staticmethod( changeIdInList ) + + ## Return the length of the overlap between two lists of Set instances + # + # @param lSets1 list of Set instances + # @param lSets2 list of Set instances + # @return length of overlap + # @warning sequence names are supposed to be identical + # + def getOverlapLengthBetweenLists(lSets1, lSets2): + lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1) + lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2) + osize = 0 + i = 0 + j = 0 + while i!= len(lSet1Sorted): + while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\ + and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])): + j+=1 + jj=j + while jj!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[jj]): + osize+=lSet1Sorted[i].getOverlapLength(lSet2Sorted[jj]) + jj+=1 + i+=1 + return osize + + getOverlapLengthBetweenLists = staticmethod( getOverlapLengthBetweenLists ) + + ## Return True if the two lists of Set instances overlap, False otherwise + # + # @param lSets1 list of Set instances + # @param lSets2 list of Set instances + # + def areSetsOverlappingBetweenLists( lSets1, lSets2 ): + lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1) + lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2) + i=0 + j=0 + while i!= len(lSet1Sorted): + while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\ + and 
not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])): + j+=1 + if j!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[j]): + return True + i+=1 + return False + + areSetsOverlappingBetweenLists = staticmethod( areSetsOverlappingBetweenLists ) + + ## Merge all overlapping Set instances between two lists of Set and give the next identifier + # + # @param lSets1 list of Set instances + # @param lSets2 list of Set instances + # @param max_id start id value for inserting new Set + # @return a new list of the merged Set instances and the next identifier + # + def getListOfMergedSetsAndNextId(lSets1, lSets2, max_id=0): + lSets_merged = [] + list2merge = SetUtils.getListOfIdListOfOverlappingSets ( lSets1,lSets2 ) + idlist1 = SetUtils.getDictOfListsWithIdAsKey(lSets1) + idlist2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) + if max_id == 0: + max_id = max(idlist1.keys()) + 1 + for i in list2merge: + if i == []: + continue + l = [] + min_id = max(i) + for j in i: + if j>0: + if min_id>j: + min_id=j + l.extend(idlist1[j]) + del idlist1[j] + else: + l.extend(idlist2[j*-1]) + del idlist2[j*-1] + l = SetUtils.mergeSetsInList(l) + SetUtils.changeIdInList(l, min_id) + lSets_merged.extend(l) + for id, alist in idlist1.items(): + lSets_merged.extend(alist) + for id,alist in idlist2.items(): + SetUtils.changeIdInList(alist,max_id) + lSets_merged.extend(alist) + max_id+=1 + return lSets_merged, max_id + + getListOfMergedSetsAndNextId = staticmethod ( getListOfMergedSetsAndNextId ) + +# ## Concatenate two Set instance lists and give the next identifier +# # +# # @param lSets1 list of Set instances +# # @param lSets2 list of Set instances +# # @param maxId start id value for inserting new Set +# # @return a new list of Set instances and the next identifier +# # +# @staticmethod +# def getSetsListOfTwoConcatenatedSetsListAndNextId(lSets1, lSets2, maxId = 0): +# lOutSets = lSets1 +# dId2SetsList2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) +# if maxId == 0: +# dId2SetsList1 = 
SetUtils.getDictOfListsWithIdAsKey(lSets1) +# maxId = max(dId2SetsList1.keys()) +# for lSets in dId2SetsList2.values(): +# SetUtils.changeIdInList(lSets, maxId) +# lOutSets.extend(lSets) +# maxId += 1 +# return lOutSets, maxId + + ## Return the sum of the length of each Set instance in the given list + # + # @param lSets: list of Set instances + # + def getCumulLength(lSets): + length = 0 + for i in lSets: + length += i.getLength() + return length + + getCumulLength = staticmethod( getCumulLength ) + + ## Return a tuple with min and max coordinates of Set instances in the given list + # + # @param lSets list of Set instances + # + def getListBoundaries(lSets): + qmin = -1 + qmax = -1 + for iSet in lSets: + if qmin == -1: + qmin = iSet.start + qmin = min(qmin, iSet.getMin()) + qmax = max(qmax, iSet.getMax()) + return (qmin, qmax) + + getListBoundaries = staticmethod( getListBoundaries ) + + ## Show Set instances contained in the given list + # + # @param lSets list of Set instances + # + def showList(lSets): + for iSet in lSets: + iSet.show() + + showList = staticmethod( showList ) + + ## Write Set instances contained in the given list + # + # @param lSets list of Set instances + # @param fileName a file name + # @param mode the open mode of the file '"w"' or '"a"' + # + def writeListInFile(lSets, fileName, mode="w"): + fileHandler = open(fileName, mode) + for iSet in lSets: + iSet.write(fileHandler) + fileHandler.close() + + writeListInFile = staticmethod( writeListInFile ) + + ## Split a Set list in several Set lists according to the identifier + # + # @param lSets list of Set instances + # @return a dictionary which keys are identifiers and values Set lists + # + def getDictOfListsWithIdAsKey(lSets): + dId2SetList = {} + for iSet in lSets: + if dId2SetList.has_key(iSet.id): + dId2SetList[iSet.id].append(iSet) + else: + dId2SetList[iSet.id] = [iSet] + return dId2SetList + + getDictOfListsWithIdAsKey = staticmethod( getDictOfListsWithIdAsKey ) + + + ## Split a Set 
list in several Set lists according to the identifier + # + # @param lSets list of Set instances + # @return a dictionary which keys are identifiers and values Set lists + # + def getDictOfListsWithIdAsKeyFromFile( setFile ): + dId2SetList = {} + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromTuple( line[:-1].split("\t") ) + if not dId2SetList.has_key( iSet.id ): + dId2SetList[ iSet.id ] = [] + dId2SetList[ iSet.id ].append( iSet ) + setFileHandler.close() + return dId2SetList + + getDictOfListsWithIdAsKeyFromFile = staticmethod( getDictOfListsWithIdAsKeyFromFile ) + + + ## Return a Map list from the given Set List + # + # @param lSets list of Set instances + # + def getMapListFromSetList(lSets): + lMaps = [] + for iSet in lSets: + lMaps.append(iSet.set2map()) + return lMaps + + getMapListFromSetList = staticmethod( getMapListFromSetList ) + + ## Construct a Set list from a Map list + # + # @param lMaps list of Map instances + # + def getSetListFromMapList(lMaps): + lSets = [] + c = 0 + for iMap in lMaps: + c += 1 + lSets.append( Set(c, iMap.name, iMap.seqname, iMap.start, iMap.end) ) + return lSets + + getSetListFromMapList = staticmethod( getSetListFromMapList ) + + ## Merge all overlapping Set instances in a list without considering the identifiers. + # Start by sorting Set instances by their increasing Min coordinate. 
+ # + # @return: a new list of the merged Set instances + # + def mergeSetsInList(lSets): + l=[] + if len(lSets)==0: + return l + + lSortedSets = SetUtils.getSetListSortedByIncreasingMinThenInvLength( lSets ) + + prev_count = 0 + for iSet in lSortedSets[0:]: + if prev_count != len(lSortedSets): + for i in lSortedSets[ prev_count + 1: ]: + if iSet.isOverlapping( i ): + iSet.merge( i ) + IsAlreadyInList = False + for newSet in l: + if newSet.isOverlapping( iSet ): + IsAlreadyInList = True + newSet.merge( iSet ) + l [ l.index( newSet ) ] = newSet + if not IsAlreadyInList: + l.append( iSet ) + prev_count += 1 + return l + + mergeSetsInList = staticmethod( mergeSetsInList ) + + ## Unjoin a Set list according to another + # + # @param lToKeep: a list of Set instances to keep + # @param lToUnjoin: a list of Set instances to unjoin + # @return: lToUnjoin split in several list + # + def getSetListUnjoined(lToKeep, lToUnjoin): + lSortedToKeep = SetUtils.getSetListSortedByIncreasingMinThenMax( lToKeep ) + lSortedToUnjoin = SetUtils.getSetListSortedByIncreasingMinThenMax( lToUnjoin ) + if lSortedToUnjoin == []: + return [] + if lSortedToKeep == []: + return [ lSortedToUnjoin ] + + i=0 + resultListSet=[] + while i lSortedToUnjoin[j1].getMax(): + j1+=1 + if j1==len(lSortedToUnjoin): + break + if j1!=0: + resultListSet.append(lSortedToUnjoin[:j1]) + del lSortedToUnjoin[:j1] + j1=0 + if i+1==len(lSortedToKeep): + break + j2=j1 + if j2 lSortedToUnjoin[j2].getMax(): + while j2 lSortedToUnjoin[j2].getMax(): + j2+=1 + resultListSet.append(lSortedToUnjoin[j1:j2]) + del lSortedToUnjoin[j1:j2] + i+=1 + + if resultListSet!=[] or i == 0: + resultListSet.append(lSortedToUnjoin) + return resultListSet + + getSetListUnjoined = staticmethod(getSetListUnjoined) + + ## Return new list of Set instances with no duplicate + # + # @param lSets list of Set instances + # + def getSetListWithoutDuplicates( lSets ): + if len(lSets) < 2: + return lSets + lSortedSet = 
SetUtils.getSetListSortedByIncreasingMinThenMax( lSets ) + lUniqSet = [ lSortedSet[0] ] + for iSet in lSortedSet[1:]: + if iSet != lUniqSet[-1]: + lUniqSet.append( iSet ) + return lUniqSet + + getSetListWithoutDuplicates = staticmethod( getSetListWithoutDuplicates ) + + ## Return a list of Set instances sorted in increasing order according to the Min, then the Max, and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedByIncreasingMinThenMax( lSets ): + return sorted( lSets, key=lambda iSet: ( iSet.getMin(), iSet.getMax() ) ) + + getSetListSortedByIncreasingMinThenMax = staticmethod( getSetListSortedByIncreasingMinThenMax ) + + ## Return a list of Set instances sorted in increasing order according to the min, then the inverse of the length, and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedByIncreasingMinThenInvLength( lSets ): + return sorted( lSets, key=lambda iSet: ( iSet.getMin(), 1 / float(iSet.getLength()) ) ) + + getSetListSortedByIncreasingMinThenInvLength = staticmethod( getSetListSortedByIncreasingMinThenInvLength ) + + ## Return a list of Set instances sorted in increasing order according to the SeqName, then the Name, then the Min, then the Max and finally their initial order + # + # @param lSets: list of Set instances + # + def getSetListSortedBySeqThenRegionThenMinThenMax(lSets): + return sorted(lSets, key=lambda iSet: (iSet.getSeqname(), iSet.getName(), iSet.getMin(), iSet.getMax())) + + getSetListSortedBySeqThenRegionThenMinThenMax = staticmethod(getSetListSortedBySeqThenRegionThenMinThenMax) + + ## Return a list of identifier lists of overlapping Sets from the subject list, according to the reference list + # + # @param lRef list of Set instances + # @param lSubject list of Set instances + # + def getListOfIdListOfOverlappingSets(lRef,lSubject): + lSortedRef = SetUtils.getSetListSortedByIncreasingMinThenMax( lRef ) + lSortedSubject = 
SetUtils.getSetListSortedByIncreasingMinThenMax( lSubject ) + + lOverlappingSet = [] + lOverlappingSetCounter = 0 + + id2LOverlappingSet_pos = {} + + i = 0 + j = 0 + while i!= len(lSortedRef): + while j!= len(lSortedSubject) and lSortedRef[i].getMin()>lSortedSubject[j].getMax()\ + and not(lSortedRef[i].isOverlapping(lSortedSubject[j])\ + and lSortedRef[i].isOnDirectStrand()==lSortedSubject[j].isOnDirectStrand()): + j+=1 + jj=j + while jj!= len(lSortedSubject) and lSortedRef[i].isOverlapping(lSortedSubject[jj])\ + and lSortedRef[i].isOnDirectStrand()==lSortedSubject[jj].isOnDirectStrand(): + id1=lSortedRef[i].id + id2=lSortedSubject[jj].id*-1 + if id2LOverlappingSet_pos.has_key(id1) \ + and not id2LOverlappingSet_pos.has_key(id2): + lOverlappingSet[id2LOverlappingSet_pos[id1]].append(id2) + id2LOverlappingSet_pos[id2]=id2LOverlappingSet_pos[id1] + if id2LOverlappingSet_pos.has_key(id2) \ + and not id2LOverlappingSet_pos.has_key(id1): + lOverlappingSet[id2LOverlappingSet_pos[id2]].append(id1) + id2LOverlappingSet_pos[id1]=id2LOverlappingSet_pos[id2] + if not id2LOverlappingSet_pos.has_key(id2) \ + and not id2LOverlappingSet_pos.has_key(id1): + lOverlappingSet.append([id1,id2]) + id2LOverlappingSet_pos[id1]=lOverlappingSetCounter + id2LOverlappingSet_pos[id2]=lOverlappingSetCounter + lOverlappingSetCounter+=1 + jj+=1 + i+=1 + + return lOverlappingSet + + getListOfIdListOfOverlappingSets = staticmethod (getListOfIdListOfOverlappingSets) + + ## Return a list of sets without overlapping between two lists of sets + # + # @param lSet1 and lSet2 + # + def getListOfSetWithoutOverlappingBetweenTwoListOfSet(lSet1, lSet2): + for i in lSet1: + for idx,j in enumerate(lSet2): + n=j.diff(i) + if not n.isEmpty() and n.getLength()>=20: + lSet2.append(n) + lSet2WithoutOverlaps=[] + for i in lSet2: + if not i.isEmpty() and i.getLength()>=20: + lSet2WithoutOverlaps.append(i) + return lSet2WithoutOverlaps + + getListOfSetWithoutOverlappingBetweenTwoListOfSet = staticmethod 
(getListOfSetWithoutOverlappingBetweenTwoListOfSet) + + ## Return a Set list from a Set file + # + # @param setFile string name of a Set file + # @return a list of Set instances + # + def getSetListFromFile( setFile ): + lSets = [] + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromString( line ) + lSets.append( iSet ) + setFileHandler.close() + return lSets + + getSetListFromFile = staticmethod( getSetListFromFile ) + + + def convertSetFileIntoMapFile( setFile, mapFile ): + setFileHandler = open( setFile, "r" ) + mapFileHandler = open( mapFile, "w" ) + iSet = Set() + while True: + line = setFileHandler.readline() + if line == "": + break + iSet.setFromString( line ) + iMap = iSet.getMapInstance() + iMap.write( mapFileHandler ) + setFileHandler.close() + mapFileHandler.close() + + convertSetFileIntoMapFile = staticmethod( convertSetFileIntoMapFile ) + + + def getDictOfListsWithSeqnameAsKey( lSets ): + dSeqnamesToSetList = {} + for iSet in lSets: + if not dSeqnamesToSetList.has_key( iSet.seqname ): + dSeqnamesToSetList[ iSet.seqname ] = [] + dSeqnamesToSetList[ iSet.seqname ].append( iSet ) + return dSeqnamesToSetList + + getDictOfListsWithSeqnameAsKey = staticmethod( getDictOfListsWithSeqnameAsKey ) + + + def filterOnLength( lSets, minLength=0, maxLength=10000000000 ): + if minLength == 0 and maxLength == 0: + return lSets + lFiltered = [] + for iSet in lSets: + if minLength <= iSet.getLength() <= maxLength: + lFiltered.append( iSet ) + return lFiltered + + filterOnLength = staticmethod( filterOnLength ) + + + def getListOfNames( setFile ): + lNames = [] + setFileHandler = open( setFile, "r" ) + iSet = Set() + while True: + line = setFileHandler.readline() + if line == "": + break + iSet.setFromTuple( line[:-1].split("\t") ) + if iSet.name not in lNames: + lNames.append( iSet.name ) + setFileHandler.close() + return lNames + + getListOfNames = staticmethod( getListOfNames ) 
+ + + def getDictOfDictsWithNamesThenIdAsKeyFromFile( setFile ): + dNames2DictsId = {} + setFileHandler = open( setFile, "r" ) + while True: + line = setFileHandler.readline() + if line == "": + break + iSet = Set() + iSet.setFromTuple( line[:-1].split("\t") ) + if not dNames2DictsId.has_key( iSet.name ): + dNames2DictsId[ iSet.name ] = { iSet.id: [ iSet ] } + else: + if not dNames2DictsId[ iSet.name ].has_key( iSet.id ): + dNames2DictsId[ iSet.name ][ iSet.id ] = [ iSet ] + else: + dNames2DictsId[ iSet.name ][ iSet.id ].append( iSet ) + setFileHandler.close() + return dNames2DictsId + + getDictOfDictsWithNamesThenIdAsKeyFromFile = staticmethod( getDictOfDictsWithNamesThenIdAsKeyFromFile ) diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/SlidingWindow.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/SlidingWindow.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,73 @@ +class SlidingWindow(object): + + def __init__( self, length = 1, overlap = 1 ): + self._length = length + self._overlap = overlap + self._start = 1 + self._end = length + self._step = length - overlap + + def slideWindowOnce(self): + self._start = self._start + self._step + self._end = self._end + self._step + + def getStart(self): + return self._start + + def getEnd(self): + return self._end + + def setStart(self, start): + self._start = start + + def setEnd(self, end): + self._end = end + + def getLength(self): + return self._length + + def getOverlap(self): + return self._overlap + + def setLength(self, length): + self._length = length + + def setOverlap(self, overlap): + self._overlap = overlap + + def getSlidingMsg(self): + return "Window is sliding : %s %s" %(self._start, self._end) + +class SlidingWindowToCountMatchingBases(SlidingWindow): + + def getSetLengthOnWindow( self, iSet ): + if self._isSetIncludedInTheWindow(iSet): + return iSet.getLength() + if self._isWindowIncludedInTheSet(iSet): + return self._length + elif 
self._isSetOverlapTheRightSideOfTheWindow(iSet): + return self._end - iSet.getMin()+1 + elif self._isSetOverlapTheLeftSideOfTheWindow(iSet): + return iSet.getMax() - self._start+1 + + def getCoordSetOnWindow( self, iSet ): + if self._isSetIncludedInTheWindow(iSet): + return iSet.getStart(), iSet.getEnd() + if self._isWindowIncludedInTheSet(iSet): + return self.getStart(), self.getEnd() + elif self._isSetOverlapTheRightSideOfTheWindow(iSet): + return iSet.getStart(), self.getEnd() + elif self._isSetOverlapTheLeftSideOfTheWindow(iSet): + return self.getStart(), iSet.getEnd() + + def _isSetIncludedInTheWindow(self, feature): + return feature.getMin() >= self._start and feature.getMax() <= self._end + + def _isWindowIncludedInTheSet(self, feature): + return self._start >= feature.getMin() and self._end <= feature.getMax() + + def _isSetOverlapTheRightSideOfTheWindow(self, feature): + return feature.getMin() <= self._end and feature.getMin() >= self._start + + def _isSetOverlapTheLeftSideOfTheWindow(self, feature): + return feature.getMax() <= self._end and feature.getMax() >= self._start diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/coord/align2set.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/align2set.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +import sys +import getopt +from commons.core.coord.Align import Align + +def help(): + print + print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ) + print "options:" + print " -h: this help" + print " -i: input file name (format='align')" + print " -o: output file name (format='set', default=inFileName+'.set')" + print " -v: verbosity level (default=0/1)" + print + + +def align2set( inFileName, outFileName ): + alignFileHandler = open( inFileName, "r" ) + setFileHandler = open( outFileName, "w" ) + iAlign = Align() + countAlign = 0 + while True: + line = alignFileHandler.readline() + if 
line == "": + break + countAlign += 1 + iAlign.setFromString( line, "\t" ) + setFileHandler.write( "%i\t%s\t%s\t%i\t%i\n" % ( countAlign, + iAlign.getSubjectName(), + iAlign.getQueryName(), + iAlign.getQueryStart(), + iAlign.getQueryEnd() ) ) + alignFileHandler.close() + setFileHandler.close() + + +def main(): + + inFileName = "" + outFileName = "" + verbose = 0 + + try: + opts, args = getopt.getopt( sys.argv[1:], "hi:o:v:" ) + except getopt.GetoptError, err: + print str(err) + help() + sys.exit(1) + for o,a in opts: + if o == "-h": + help() + sys.exit(0) + elif o == "-i": + inFileName = a + elif o == "-o": + outFileName = a + elif o == "-v": + verbose = int(a) + + if inFileName == "": + print "ERROR: missing input file name" + help() + sys.exit(1) + + if verbose > 0: + print "START %s" % ( sys.argv[0].split("/")[-1] ) + sys.stdout.flush() + + if outFileName == "": + outFileName = "%s.set" % ( inFileName ) + +#TODO: move 'align2set' into 'AlignUtils.convertAlignFileIntoPSetFile' with a test +# AlignUtils.convertAlignFileIntoPSetFile( inFileName, outFileName ) + + align2set( inFileName, outFileName ) + + if verbose > 0: + print "END %s" % ( sys.argv[0].split("/")[-1] ) + sys.stdout.flush() + + return 0 + + +if __name__ == "__main__": + main() diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/.BamParser.py.swp Binary file commons/core/parsing/.BamParser.py.swp has changed diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/AxtParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/AxtParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,140 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import re +import sys +from SMART.Java.Python.structure.Mapping import Mapping +from SMART.Java.Python.structure.SubMapping import SubMapping +from commons.core.parsing.MapperParser import MapperParser +from SMART.Java.Python.misc import Utils +from SMART.Java.Python.misc.Utils import getHammingDistance + + +class AxtParser(MapperParser): + """A class that parses AXT (as given by Mosaik)""" + + def __init__(self, fileName, verbosity = 0): + super(AxtParser, self).__init__(fileName, verbosity) + self.queryLine = None + self.subjectLine = None + + def __del__(self): + super(AxtParser, self).__del__() + + + def getFileFormats(): + return ["axt"] + getFileFormats = staticmethod(getFileFormats) + + + def skipFirstLines(self): + pass + + + def getInfos(self): + self.chromosomes = set() + self.nbMappings = 0 + self.size = 0 + cpt = 0 + self.reset() + for line in self.handle: + line = line.strip() + if line == "": continue + if cpt % 3 == 0: + line = line.strip() + parts = line.split(" ") + self.chromosomes.add(parts[1]) + self.size += int(parts[6]) + self.nbMappings += 1 + cpt += 1 + if self.verbosity >= 10 and self.nbMappings % 100000 == 0: + sys.stdout.write(" %d mappings read\r" % (self.nbMappings)) + sys.stdout.flush() + self.reset() + if self.verbosity >= 10: + print " %d mappings read" % (self.nbMappings) + print "Done." 
+ + + def parseLine(self, line): + + if line.strip() == "": + for line in self.handle: + self.currentLineNb += 1 + break + if line.strip() == "": + return None + + m = re.search(r"^\s*\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+\d+\s*$", line) + if m != None: + mapping = Mapping() + subMapping = SubMapping() + + subMapping.queryInterval.setName(m.group(4)) + subMapping.queryInterval.setStart(min(int(m.group(5)), int(m.group(6)))) + subMapping.queryInterval.setEnd(max(int(m.group(5)), int(m.group(6)))) + subMapping.queryInterval.setDirection(m.group(7)) + + subMapping.targetInterval.setChromosome(m.group(1)) + subMapping.targetInterval.setStart(min(int(m.group(2)), int(m.group(3)))) + subMapping.targetInterval.setEnd(max(int(m.group(2)), int(m.group(3)))) + subMapping.targetInterval.setDirection(1) + + subMapping.setSize(min(subMapping.targetInterval.getSize(), subMapping.queryInterval.getSize())) + subMapping.setDirection(m.group(7)) + + mapping.addSubMapping(subMapping) + + mapping.setDirection(m.group(7)) + mapping.targetInterval.setChromosome(m.group(1)) + mapping.targetInterval.setStart(min(int(m.group(2)), int(m.group(3)))) + mapping.targetInterval.setEnd(max(int(m.group(2)), int(m.group(3)))) + + mapping.queryInterval.setName(m.group(4)) + mapping.queryInterval.setStart(min(int(m.group(5)), int(m.group(6)))) + mapping.queryInterval.setEnd(max(int(m.group(5)), int(m.group(6)))) + + mapping.setSize(min(mapping.targetInterval.getSize(), mapping.queryInterval.getSize())) + + self.currentMapping = mapping + return None + if self.queryLine == None: + self.queryLine = line + return None + self.subjectLine = line + seqLen = float(len(self.subjectLine)) + dist = float(getHammingDistance(self.queryLine, self.subjectLine)) + self.currentMapping.setNbMismatches(getHammingDistance(self.queryLine, self.subjectLine)) + self.currentMapping.setNbGaps(0) + self.queryLine = None + self.subjectLine = None + return self.currentMapping + + + diff -r d22fadc825e3 
-r 2c0c0a89fad7 commons/core/parsing/BamParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BamParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,483 @@ +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import re, sys, gzip, struct +from commons.core.parsing.MapperParser import MapperParser +from SMART.Java.Python.structure.Mapping import Mapping +from SMART.Java.Python.structure.SubMapping import SubMapping +from SMART.Java.Python.structure.Interval import Interval + + +BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN" + +BAM_CIGAR_LOOKUP = "MIDNSHP=X" +BAM_CIGAR_SHIFT = 4 +BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1) + + + +def pack_int32(x): + return struct.pack('> BAM_CIGAR_SHIFT + self._type = BAM_CIGAR_LOOKUP[ data & BAM_CIGAR_MASK ] + + +class CigarData(object): + def __init__(self, data, num_ops): + self._ops = [] + for i in range(num_ops): + cigar_data = unpack_uint32(data[i*4: (i+1)*4]) + self._ops.append(CigarOp(cigar_data)) + + def getCigarData(self): + return self._ops + + def __str__(self): + return "".join(["%d%s" % (op._length, op._type) for op in self._ops]) + + +class TagsData(object): + def __init__(self): + self._tags = {} + + def add(self, tag): + self._tags[tag._tag] = tag + + def getTags(self): + return self._tags + + def __str__(self): + return " ".join([self._tags[tag] for tag in sorted(self._tags.keys())]) + + +class TagData(object): + def __init__(self, tag, type, value): + self._tag = tag + self._type = type + self._value = value + + def __str__(self): + if self._type in "AZHB": + return "%s:%s:%s" % (self._tag, self._type, self._value) + if self._type in "cCsSiI": + return "%s:%s:%d" % (self._tag, self._type, self._value) + return "%s:%s:%f" % (self._tag, self._type, self._value) + + +class TagParser(object): + def __init__(self, data): + self._data = data + self._tags = TagsData() + self._parse() + + def _parse(self): + while self._data: + tag = "%s%s" % (chr(unpack_int8(self._data[0])), chr(unpack_int8(self._data[1]))) + type = chr(unpack_int8(self._data[2])) + self._data = self._data[3:] + if type in BAM_TAG_VALUE: + value = self._parseUnique(type) + elif type == "Z": + value = self._parseString() + elif type == "H": + size = 
unpack_int8(self._data[0]) + self._data = self._data[1:] + value = self._parseSeveral("C", size) + elif type == "B": + secondType = unpack_int8(self._data[0]) + size = unpack_int8(self._data[1]) + unpack_int8(self._data[2]) * 16 + unpack_int8(self._data[3]) * 16 * 16 + unpack_int8(self._data[4]) * 16 * 16 * 16 + self._data = self._data[5:] + value = self._parseSeveral(secondType, size) + else: + raise Exception("Cannot parse type '%s'." % (type)) + fullTag = TagData(tag, type, value) + self._tags.add(fullTag) + + def _parseUnique(self, type): + value = BAM_TAG_CODE[type](self._data[:BAM_TAG_SIZE[type]]) + self._data = self._data[BAM_TAG_SIZE[type]:] + return value + + def _parseSeveral(self, type, size): + value = [] + for i in range(size): + value.append(self._parseUnique(type)) + return value + + def _parseString(self): + value = "" + char = self._data[0] + self._data = self._data[1:] + while unpack_int8(char) != 0: + value += char + char = self._data[0] + self._data = self._data[1:] + return value + + def getTags(self): + return self._tags.getTags() + + def __str__(self): + return self._tags + + +class AlignedRead(object): + def __init__(self, data, refs): + self._data = data + self._refs = refs + + def parse(self): + self._parse_common() + self._parse_flag_nc() + self._parse_bin_mq_nl() + self._parse_name() + self._parse_cigar() + self._parse_sequence() + self._parse_quality() + self._parse_tags() + + def _parse_common(self): + ref_id = unpack_int32(self._data[0:4]) + self._chromosome = self._refs[ref_id] + self._pos = unpack_int32(self._data[4:8]) + 1 + mate_ref_id = unpack_int32(self._data[20:24]) + if mate_ref_id == -1: + self._rnext = "*" + else: + self._rnext = self._refs[mate_ref_id] + if self._rnext == self._chromosome: + self._rnext = "=" + self._pnext = unpack_int32(self._data[24:28]) + 1 + self._tlen = unpack_int32(self._data[28:32]) + + def _parse_bin_mq_nl(self): + bin_mq_nl = unpack_uint32(self._data[8:12]) + self._bin = bin_mq_nl >> 16 + 
self._mappingQuality = bin_mq_nl >> 8 & 0xff + self._query_name_length = bin_mq_nl & 0xff + + def _parse_flag_nc(self): + flag_nc = unpack_uint32(self._data[12:16]) + self._flag = flag_nc >> 16 + self._num_cigar_ops = flag_nc & 0xffff + + def _parse_name(self): + start = 32 + stop = start + self._query_name_length + self._name = unpack_string(self._data[start:stop]) + + def _parse_cigar(self): + start = 32 + self._query_name_length + stop = start + (self._num_cigar_ops * 4) + _buffer = self._data[start:stop] + cigar = CigarData(_buffer, self._num_cigar_ops) + self._cigar = cigar.getCigarData() + + def _parse_sequence(self): + seq_length = unpack_int32(self._data[16:20]) + start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + stop = start + (seq_length + 1) / 2 + _buffer = self._data[start:stop] + self._sequence = "" + for i in range(seq_length): + x = unpack_uint8(_buffer[(i / 2)]) + index = (x >> (4 * (1 - (i % 2)))) & 0xf + base = BAM_DNA_LOOKUP[index] + self._sequence += base + + def _parse_quality(self): + seq_length = unpack_int32(self._data[16:20]) + start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 + stop = start + seq_length + _buffer = self._data[start:stop] + self._quality = "".join(["%s" % (chr(unpack_int8(x) + 33)) for x in _buffer]) + + def _parse_tags(self): + seq_length = unpack_int32(self._data[16:20]) + start = 32 + self._query_name_length + (self._num_cigar_ops * 4) + (seq_length + 1) / 2 + (seq_length + 1) - 1 + stop = start + seq_length + _buffer = self._data[start:] + tagParser = TagParser(_buffer) + self._tags = tagParser.getTags() + + +class FileReader(object): + + def __init__(self, handle): + self._handle = handle + self._readHeader() + + def _readHeader(self): + magic = unpack_string(self._handle.read(4)) + if magic != "BAM\1": + raise Exception("File should start with 'BAM\1', starting with '%s' instead." 
% (magic)) + tlen = unpack_int32(self._handle.read(4)) + text = unpack_string(self._handle.read(tlen)) + nrefs = unpack_int32(self._handle.read(4)) + self._refs = [] + for i in range(nrefs): + sizeName = unpack_int32(self._handle.read(4)) + name = unpack_string(self._handle.read(sizeName)) + size = unpack_int32(self._handle.read(4)) + self._refs.append(name) + self._startPos = self._handle.tell() + + def reset(self): + self._handle.seek(self._startPos) + + def getNextAlignment(self): + try: + blockSize = unpack_int32(self._handle.read(4)) + except struct.error: + return False + block = self._handle.read(blockSize) + currentRead = AlignedRead(block, self._refs) + return currentRead + + + +def parseAlignedRead(read): + if (read._flag & 0x4) == 0x4: + return None + + mapping = Mapping() + direction = 1 if (read._flag & 0x10) == 0x0 else -1 + genomeStart = read._pos + nbOccurrences = 1 + nbMismatches = 0 + nbMatches = 0 + nbGaps = 0 + subMapping = None + queryOffset = 0 + targetOffset = 0 + readStart = None + + for tag, value in read._tags.iteritems(): + if tag == "X0": + nbOccurrences = value._value + elif tag == "X1": + nbOccurrences += value._value + elif tag == "XM": + nbMismatches = value._value + mapping.setTagValue("nbOccurrences", nbOccurrences) + mapping.setTagValue("quality", read._mappingQuality) + + for operation in read._cigar: + if operation._type == "M": + if readStart == None: + readStart = queryOffset + if subMapping == None: + subMapping = SubMapping() + subMapping.setSize(operation._length) + subMapping.setDirection(direction) + subMapping.queryInterval.setName(read._name) + subMapping.queryInterval.setStart(queryOffset) + subMapping.queryInterval.setDirection(direction) + subMapping.targetInterval.setChromosome(read._chromosome) + subMapping.targetInterval.setStart(genomeStart + targetOffset) + subMapping.targetInterval.setDirection(1) + nbMatches += operation._length + targetOffset += operation._length + queryOffset += operation._length + 
currentNumber = 0 + continue + if operation._type == "I": + nbGaps += 1 + queryOffset += operation._length + currentNumber = 0 + continue + if operation._type == "D": + if subMapping != None: + subMapping.queryInterval.setEnd(queryOffset - 1) + subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) + mapping.addSubMapping(subMapping) + subMapping = None + nbGaps += 1 + targetOffset += operation._length + currentNumber = 0 + continue + if operation._type == "N": + if subMapping != None: + subMapping.queryInterval.setEnd(queryOffset - 1) + subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) + mapping.addSubMapping(subMapping) + subMapping = None + targetOffset += operation._length + currentNumber = 0 + continue + if operation._type == "S": + nbMismatches += operation._length + targetOffset += operation._length + queryOffset += operation._length + currentNumber = 0 + continue + if operation._type == "H": + targetOffset += operation._length + queryOffset += operation._length + currentNumber = 0 + continue + if operation._type == "P": + continue + raise Exception("Do not understand parameter '%s'" % (operation._type)) + + if subMapping != None: + subMapping.queryInterval.setEnd(queryOffset - 1) + subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) + mapping.addSubMapping(subMapping) + mapping.queryInterval.setStart(readStart) + mapping.queryInterval.setEnd(queryOffset - 1) + mapping.targetInterval.setEnd(genomeStart + targetOffset - 1) + mapping.setNbMismatches(nbMismatches) + mapping.setNbGaps(nbGaps) + mapping.queryInterval.setName(read._name) + mapping.queryInterval.setDirection(direction) + mapping.targetInterval.setChromosome(read._chromosome) + mapping.targetInterval.setStart(genomeStart) + mapping.targetInterval.setDirection(direction) + mapping.setSize(len(read._sequence)) + mapping.setDirection(direction) + return mapping + + +class BamParser(MapperParser): + """A class that parses BAM format""" + + def __init__(self, 
fileName, verbosity = 0): + self.verbosity = verbosity + self.handle = gzip.open(fileName, "rb") + self.reader = FileReader(self.handle) + self.nbMappings = None + self.fileName = fileName + + + def __del__(self): + self.handle.close() + + + def getFileFormats(): + return ["bam"] + getFileFormats = staticmethod(getFileFormats) + + + def reset(self): + self.reader.reset() + + + def getNextMapping(self): + self.currentMapping = None + while self.currentMapping == None: + read = self.reader.getNextAlignment() + if not read: + self.currentMapping = False + return False + read.parse() + self.currentMapping = parseAlignedRead(read) + return self.currentMapping + + + def setDefaultTagValue(self, name, value): + pass + + + def skipFirstLines(self): + pass diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/BedParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BedParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,139 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
import re
import sys
from SMART.Java.Python.structure.Interval import Interval
from commons.core.parsing.TranscriptListParser import TranscriptListParser
from SMART.Java.Python.structure.Transcript import Transcript


class BedParser(TranscriptListParser):
    """A class that parses a BED file and creates a transcript list.

    Four line layouts are recognised by parseLine(), tried in this order:
    3 columns (chromosome, start, end), 4 columns (+ name), 5 columns
    (+ score) and the 12-column layout with strand and exon blocks.
    """

    # Layouts without strand/exon information: (pattern, whether group 4
    # holds a feature name).  Tried in order before the 12-column layout.
    _SIMPLE_LAYOUTS = (
        (r"^\s*(\S+)\t+(\d+)\t+(\d+)\s*$", False),
        (r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\s*$", True),
        (r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+\d+\.?\d*\s*$", True),
    )

    # 12-column layout: groups are chromosome, start, end, name, strand,
    # number of blocks, block sizes, block starts.
    _FULL_LAYOUT = r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+\d+\t+([+-])\t+\d+\t+\d+\t+0\t+(\d+)\t+(\S+)\t+(\S+)\s*$"

    def __init__(self, fileName, verbosity = 0):
        """Remember that no track title was read yet, then delegate to the parent."""
        self.title = None
        TranscriptListParser.__init__(self, fileName, verbosity)

    # NOTE: there is deliberately no __del__ override here; an earlier one
    # that only delegated to the parent class was commented out.

    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["bed"]

    def skipFirstLines(self):
        """Consume the optional 'track name=...' header line.

        When the first line is a track line, its name is stored in
        self.title and the line counter is advanced; otherwise the file
        position is restored so that the line is parsed normally.
        """
        mark = self.handle.tell()
        line = self.handle.readline().strip()
        # The line has just been stripped, so the name may be the very last
        # token: accept either trailing whitespace or the end of the line.
        # (The previous pattern demanded whitespace and therefore missed a
        # header made of "track name=xxx" alone.)
        m = re.search(r"^\s*track\s+name\s*=\s*(\S+)(?:\s+|$)", line)
        if m is None:
            self.handle.seek(mark)
            return
        self.title = m.group(1)
        self.currentLineNb += 1

    def _newTranscript(self, m, name, direction):
        """Build a Transcript from the first three groups of match *m*.

        The end column is decremented by one before the bounds are ordered,
        exactly as every layout branch did.
        """
        first = int(m.group(2))
        last = int(m.group(3)) - 1
        transcript = Transcript()
        transcript.setChromosome(m.group(1))
        transcript.setStart(min(first, last))
        transcript.setEnd(max(first, last))
        transcript.setName(name)
        transcript.setDirection(direction)
        return transcript

    def parseLine(self, line):
        """Convert one BED line into a Transcript.

        Raises Exception when the line matches none of the supported
        layouts, and exits the program when the exon blocks of a 12-column
        line do not span the transcript exactly.
        """
        for pattern, hasName in self._SIMPLE_LAYOUTS:
            m = re.search(pattern, line)
            if m is not None:
                name = m.group(4) if hasName else "Unnamed"
                return self._newTranscript(m, name, 1)

        m = re.search(self._FULL_LAYOUT, line)
        if m is None:
            raise Exception("\nLine %d '%s' does not has a BED format." % (self.currentLineNb, line))
        transcript = self._newTranscript(m, m.group(4), m.group(5))
        nbExons = int(m.group(6))
        sizes = m.group(7).split(",")
        starts = m.group(8).split(",")

        # check for comment in name: "name(tag=value;tag=value)"
        m = re.search(r"^([^\(]*)\((\S+)\)$", transcript.getName())
        if m is not None:
            transcript.setName(m.group(1))
            transcript.setTagValues(m.group(2), ";", "=")

        # check for nb occurrences in name: "name-3"
        m = re.search(r"(.*)-(\d+)$", transcript.getName())
        if m is not None:
            transcript.setName(m.group(1))
            transcript.setOccurrence(int(m.group(2)))

        # Block starts are relative to the transcript start.  Indexing (and
        # not zip) is kept on purpose: the lists usually end with an empty
        # item because of the trailing comma, and a list shorter than
        # nbExons must still fail loudly.
        for i in range(nbExons):
            offset = int(starts[i])
            size = int(sizes[i])
            exonStart = offset + transcript.getStart()
            exon = Interval(transcript)
            exon.setStart(exonStart)
            exon.setEnd(exonStart + size - 1)
            exon.setSize(size)
            transcript.addExon(exon)

        if transcript.exons[0].getStart() != transcript.getStart():
            sys.exit("There is something wrong with the start of transcript line '%s': transcript starts at %d whereas first exon starts at %d" % (line.strip(), transcript.start, transcript.exons[0].start))
        if transcript.exons[-1].getEnd() != transcript.getEnd():
            sys.exit("There is something wrong with the end of transcript line '%s': transcript ends at %d whereas last exon ends at %d" % (line.strip(), transcript.end, transcript.exons[-1].end))

        return transcript
import re
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.structure.SubMapping import SubMapping
from SMART.Java.Python.structure.Mapping import Mapping


class BlastParser(MapperParser):
    """Parser for the tabular output of Blast (-m 8 format).

    Each line yields one Mapping holding a single SubMapping.
    """

    def __init__(self, fileName, verbosity = 0):
        super(BlastParser, self).__init__(fileName, verbosity)

    def __del__(self):
        super(BlastParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["blast"]

    def skipFirstLines(self):
        """Nothing to skip: this format has no header handled here."""
        pass

    def parseLine(self, line):
        """Turn one tabular Blast line into a Mapping.

        The twelve groups of the pattern are, in order: query name, target
        name, identity, size, mismatches, gaps, query start, query end,
        target start, target end, e-value and a last numeric column that is
        matched but not used.  The program exits when the line does not
        match.
        """
        fields = re.search(r"^(\S+)\s+(\S+)\s+(\d+\.?\d*)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([-+]?\d+\.?\d*[eE]?[-+]?\d*)\s+(\d+\.?\d*)\s*$", line)
        if fields is None:
            sys.exit("\nLine %d '%s' does not have an Blast format" % (self.currentLineNb, line))

        queryFrom = int(fields.group(7))
        queryTo = int(fields.group(8))
        targetFrom = int(fields.group(9))
        targetTo = int(fields.group(10))

        mapping = Mapping()

        # Intervals are stored with start <= end whatever the orientation.
        queryInterval = Interval()
        queryInterval.setName(fields.group(1))
        queryInterval.setStart(min(queryFrom, queryTo))
        queryInterval.setEnd(max(queryFrom, queryTo))

        targetInterval = Interval()
        targetInterval.setChromosome(fields.group(2))
        targetInterval.setStart(min(targetFrom, targetTo))
        targetInterval.setEnd(max(targetFrom, targetTo))

        subMapping = SubMapping()
        subMapping.setQueryInterval(queryInterval)
        subMapping.setTargetInterval(targetInterval)
        mapping.addSubMapping(subMapping)

        mapping.setIdentity(round(float(fields.group(3))))
        mapping.setSize(int(fields.group(4)))
        mapping.setNbMismatches(int(fields.group(5)))
        mapping.setNbGaps(int(fields.group(6)))
        # The product of the two signed extents is handed over as the
        # direction: positive when query and target run the same way.
        # NOTE(review): it is 0 when either start equals its end -- confirm
        # that setDirection copes with that value.
        mapping.setDirection((queryTo - queryFrom) * (targetTo - targetFrom))
        mapping.setEvalue(float(fields.group(11)))

        return mapping
import os


class BlatFileParser(object):
    """Read a Blat tabular result file into BlatParser objects.

    Only the lines whose first tab-separated column is an integer are
    treated as hits; every other line (header, separators, blank lines) is
    skipped.  Results accumulate in the instance across calls.
    """

    def __init__(self, blatFileName = None):
        self._blatFileName = blatFileName
        # hits in file order, filled by parseBlatFile()
        self._lBlatHits = []
        # query name -> list of hits, filled by parseBlatFileByQueries()
        self._dBlatHitsByQueries = {}
        # query name -> 1, filled by both parse methods
        self._dQueries = {}

    def getDictOfQueries(self):
        return self._dQueries

    def getResultLinesOfOneQuery(self, queryName):
        """Return the hits of *queryName*; raises KeyError when unknown."""
        return self._dBlatHitsByQueries[queryName]

    def getDictOfBlatHitsByQueries(self):
        return self._dBlatHitsByQueries

    def getListsOfHits(self):
        return self._lBlatHits

    def parseBlatFile(self):
        """Parse the file and return the list of all hits, in file order."""
        for queryHeader, iBlatParser in self._iterHits():
            self._lBlatHits.append(iBlatParser)
        return self._lBlatHits

    def parseBlatFileByQueries(self):
        """Parse the file and return a dict: query name -> list of hits."""
        for queryHeader, iBlatParser in self._iterHits():
            # setdefault replaces the Python-2-only dict.has_key() test
            self._dBlatHitsByQueries.setdefault(queryHeader, []).append(iBlatParser)
        return self._dBlatHitsByQueries

    def _iterHits(self):
        """Yield (query name, hit) for each hit line and record the query.

        Shared by both parse methods.  The file is opened in a 'with' block
        so that it is closed once the lines are exhausted (parseBlatFile
        used to leave it open).  Line numbers start at 1.
        """
        with open(self._blatFileName, 'r') as blatFile:
            for n, line in enumerate(blatFile, 1):
                if not self._isInteger(line.split("\t")[0]):
                    continue
                iBlatParser = self._newHit(line, n)
                queryHeader = iBlatParser.getQName()
                self._dQueries[queryHeader] = 1
                yield queryHeader, iBlatParser

    def _newHit(self, line, n):
        """Build the BlatParser object describing line number *n*."""
        # Imported here so that this class can be loaded, and files without
        # any hit parsed, when the rest of the package is not importable.
        from commons.core.parsing.BlatParser import BlatParser
        iBlatParser = BlatParser()
        iBlatParser.setAttributesFromString(line, n)
        return iBlatParser

    def _isInteger(self, string):
        """Tell whether *string* can be converted by int()."""
        try:
            int(string)
            return True
        except ValueError:
            return False
import sys


## this class can parse a Blat results output file
#
class BlatParser(object):
    """Hold the 21 columns of one line of a Blat tabular result file.

    Values are kept exactly as given (strings when they come from a file);
    no numeric conversion is done.  An unset column is the empty string.
    Two objects compare equal when their target name, size, start and end
    are equal.
    """

    # Columns in file order: (label, attribute).  The label is the suffix
    # of the setter ("set" + label) and the name printed in warnings.
    _COLUMNS = (
        ("Match", "_match"),
        ("Mismatch", "_mismatch"),
        ("RepMatch", "_repMatch"),
        ("N", "_N"),
        ("QGapCount", "_QGapCount"),
        ("QGapBases", "_QGapBases"),
        ("TGapCount", "_TGapCount"),
        ("TGapBases", "_TGapBases"),
        ("Strand", "_strand"),
        ("QName", "_QName"),
        ("QSize", "_QSize"),
        ("QStart", "_QStart"),
        ("QEnd", "_QEnd"),
        ("TName", "_TName"),
        ("TSize", "_TSize"),
        ("TStart", "_TStart"),
        ("TEnd", "_TEnd"),
        ("BlockCount", "_blockCount"),
        ("BlockSizes", "_blockSizes"),
        ("QStarts", "_qStarts"),
        ("TStarts", "_tStarts"),
    )

    def __init__(self, match='', mismatch='', repMatch='', N='', QGapCount='', QGapBases='', TGapCount='', TGapBases='', strand='', QName='', QSize='', QStart='', QEnd='', TName='', TSize='', TStart='', TEnd='', blockCount='', blockSizes='', qStarts='', tStarts=''):
        self._match = match
        self._mismatch = mismatch
        self._repMatch = repMatch
        self._N = N
        self._QGapCount = QGapCount
        self._QGapBases = QGapBases
        self._TGapCount = TGapCount
        self._TGapBases = TGapBases
        self._strand = strand
        self._QName = QName
        self._QSize = QSize
        self._QStart = QStart
        self._QEnd = QEnd
        self._TName = TName
        self._TSize = TSize
        self._TStart = TStart
        self._TEnd = TEnd
        self._blockCount = blockCount
        self._blockSizes = blockSizes
        self._qStarts = qStarts
        self._tStarts = tStarts

    def __eq__(self, o):
        """Compare the target name, size, start and end of both hits."""
        try:
            other = (o._TName, o._TSize, o._TStart, o._TEnd)
        except AttributeError:
            # not a hit-like object: let Python fall back to its default
            # comparison instead of failing with AttributeError
            return NotImplemented
        return (self._TName, self._TSize, self._TStart, self._TEnd) == other

    def __ne__(self, o):
        """Negation of __eq__ (Python 2 does not derive it automatically)."""
        result = self.__eq__(o)
        if result is NotImplemented:
            return result
        return not result

    def setMatch(self, match):
        self._match = match

    def setMismatch(self, mismatch):
        self._mismatch = mismatch

    def setRepMatch(self, repMatch):
        self._repMatch = repMatch

    def setN(self, N):
        self._N = N

    def setQGapCount(self, QGapCount):
        self._QGapCount = QGapCount

    def setQGapBases(self, QGapBases):
        self._QGapBases = QGapBases

    def setTGapCount(self, TGapCount):
        self._TGapCount = TGapCount

    def setTGapBases(self, TGapBases):
        self._TGapBases = TGapBases

    def setStrand(self, strand):
        self._strand = strand

    def setQName(self, QName):
        self._QName = QName

    def setQSize(self, QSize):
        self._QSize = QSize

    def setQStart(self, QStart):
        self._QStart = QStart

    def setQEnd(self, QEnd):
        self._QEnd = QEnd

    def setTName(self, TName):
        self._TName = TName

    def setTSize(self, TSize):
        self._TSize = TSize

    def setTStart(self, TStart):
        self._TStart = TStart

    def setTEnd(self, TEnd):
        self._TEnd = TEnd

    def setBlockCount(self, blockCount):
        self._blockCount = blockCount

    def setBlockSizes(self, blockSizes):
        self._blockSizes = blockSizes

    def setQStarts(self, qStarts):
        self._qStarts = qStarts

    def setTStarts(self, tStarts):
        self._tStarts = tStarts

    def getMatch(self):
        return self._match

    def getMismatch(self):
        return self._mismatch

    def getRepMatch(self):
        return self._repMatch

    def getN(self):
        return self._N

    def getQGapCount(self):
        return self._QGapCount

    def getQGapBases(self):
        return self._QGapBases

    def getTGapCount(self):
        return self._TGapCount

    def getTGapBases(self):
        return self._TGapBases

    def getStrand(self):
        return self._strand

    def getQName(self):
        return self._QName

    def getQSize(self):
        return self._QSize

    def getQStart(self):
        return self._QStart

    def getQEnd(self):
        return self._QEnd

    def getTName(self):
        return self._TName

    def getTSize(self):
        return self._TSize

    def getTStart(self):
        return self._TStart

    def getTEnd(self):
        return self._TEnd

    def getBlockCount(self):
        return self._blockCount

    def getBlockSizes(self):
        return self._blockSizes

    def getQStarts(self):
        return self._qStarts

    def getTStarts(self):
        return self._tStarts

    def setAttributes(self, lResults, iCurrentLineNumber):
        """Fill the 21 columns from the list *lResults*.

        Each value goes through its public setter.  An empty value triggers
        a warning on stderr; if at least one column was empty, every column
        is reset to '' once all of them have been examined.  Items beyond
        the 21st are ignored; a shorter list raises IndexError.
        """
        error = False
        for index, (label, attribute) in enumerate(self._COLUMNS):
            value = lResults[index]
            if value != '':
                getattr(self, "set" + label)(value)
            else:
                sys.stderr.write("WARNING: The field %s is empty in blat file in line %s\n" % (label, iCurrentLineNumber))
                error = True

        if error:
            self._setAllToNull()

    def setAttributesFromString(self, blatLine, iCurrentLineNumber ="", fieldSeparator ="\t"):
        """Split *blatLine* and fill the columns from it.

        A line that does not hold exactly 21 columns only produces a
        warning on stderr and leaves the object unchanged.
        """
        lBlatLineItem = blatLine.rstrip().split(fieldSeparator)
        if len(lBlatLineItem) != 21:
            sys.stderr.write("WARNING: The line %s is not valid blat line (%s columns -> 21 columns needed)\n" % (iCurrentLineNumber, len(lBlatLineItem)))
        else:
            self.setAttributes(lBlatLineItem, iCurrentLineNumber)

    def _setAllToNull(self):
        """Reset every column to the empty string."""
        for label, attribute in self._COLUMNS:
            setattr(self, attribute, '')
import optparse
import os


class BlatToGff(object):
    """Convert a Blat tabular result file into a GFF3 file, one line per hit."""

    # number of header lines at the top of a Blat result file
    _NB_HEADER_LINES = 5

    def __init__(self):
        pass

    def setAttributesFromCmdLine(self):
        """Parse the command line and keep the options for checkOptions()."""
        usage = '\
        \nThis Script Launch BlatToGff.\n\n\
        Example 1: python BlatToGff.py -i blatResultsFile.tab -o outputFile.gff3\n\n'
        # the version string used to name another script (copy/paste slip)
        parser = optparse.OptionParser(usage= usage, version="BlatToGff.py v1.0")
        parser.add_option( '-i', '--input', dest='inputBLAT', help='Blat Input File Name [Format: tabular]', default= None )
        parser.add_option( '-o', '--output', dest='output', help='Output File Name [Format: GFF3]', default= None )
        parser.add_option( '-n', '--methodname', dest='methodName', help='Method name in col. 3 [Default: None]', default= None )
        ( options, args ) = parser.parse_args()
        self._options = options

    def checkOptions(self):
        """Validate the options and copy them into the instance.

        Raises Exception when the input file is missing or does not exist,
        or when no output file is given.  The options default to None, so
        both None and '' count as "not specified" (comparing with '' only
        let None through to os.path.exists() and open()).
        """
        if not self._options.inputBLAT:
            raise Exception("ERROR: No Blat file specified for -i !")
        if not os.path.exists(self._options.inputBLAT):
            raise Exception("ERROR: Blat Input File doesn't exist !")
        self._inputFileBlat = self._options.inputBLAT

        if not self._options.output:
            raise Exception("ERROR: No Output file specified for -o !")
        self._outputFileGFF = self._options.output

        self._methodName = self._options.methodName

    def run(self):
        """Check the options, then write one GFF3 line per Blat line.

        The first five lines of the input are taken to be the Blat header
        and are skipped, so the first converted line is number 6.
        """
        self.checkOptions()
        self._createGFFOutputFile()
        # 'with' closes the input file, which used to stay open
        with open(self._inputFileBlat, 'r') as BLATFile:
            for numberLine, blatLine in enumerate(BLATFile, 1):
                if numberLine <= self._NB_HEADER_LINES:
                    continue
                gffLine = self.convertBlatObjectToGffLine(blatLine, numberLine)
                self._printGFFLinesToOutputFile(gffLine)

    def convertBlatObjectToGffLine(self, blatLine, numberLine):
        """Return the GFF3 line (newline included) describing *blatLine*."""
        # Imported here so that option checking and the file helpers work
        # when the rest of the package is not importable.
        from commons.core.parsing.BlatParser import BlatParser
        iBlatHit = BlatParser()
        iBlatHit.setAttributesFromString(blatLine, numberLine)
        col1 = iBlatHit.getTName()
        col2 = 'BlatToGff'
        if not self._methodName:
            col3 = 'BES'
        else:
            col3 = '%s:BES' % self._methodName
        col4 = iBlatHit.getTStart()
        col5 = iBlatHit.getTEnd()
        col6 = '.'
        # NOTE(review): the strand is always '+'; the strand of the hit is
        # not used -- confirm this is intended.
        col7 = '+'
        col8 = '.'
        # NOTE(review): bes_size is filled with the target size (getTSize),
        # not the query size -- confirm this is intended.
        col9 = 'ID=%s;Name=%s;bes_start=%s;bes_end=%s;bes_size=%s' % (iBlatHit.getQName(), iBlatHit.getQName(), iBlatHit.getTStart(), iBlatHit.getTEnd(), iBlatHit.getTSize())
        gffLine = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (col1, col2, col3, col4, col5, col6, col7, col8, col9)
        return gffLine

    def _createGFFOutputFile(self):
        """Create (or truncate) the output file and write the GFF3 pragma."""
        with open(self._outputFileGFF, 'w') as GFFfile:
            GFFfile.write("##gff-version 3\n")

    def _printGFFLinesToOutputFile(self, line):
        """Append *line* to the output file."""
        with open(self._outputFileGFF, 'a') as GFFfile:
            GFFfile.write(line)


if __name__ == '__main__':
    iBlatToGff = BlatToGff()
    iBlatToGff.setAttributesFromCmdLine()
    iBlatToGff.run()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import optparse +import os +import sys +import re +import datetime +from commons.core.parsing.BlatParser import BlatParser +from commons.core.seq.FastaUtils import FastaUtils + +class BlatToGffForBesPaired(object): + + + def __init__(self): + pass + + def setAttributesFromCmdLine(self): + help = '\ + \nThis Script Launch BlatToGffForBesPaired.\n\n\ + Example 1: python BlatToGffForBesPaired.py -i blatResultsFile.tab -f besSequences.fasta -o outputFile.gff3\n\ + Example 2: python BlatToGffForBesPaired.py -i blatResultsFile.tab -f besSequences.fasta -o outputFile.gff3 -n muscadine:filtre1\n\n\ + Note 1: In blat input file, all BAC-Ends must be paired. 
In addition, they must be one above the other.\nFor example, if you have the BES MRRE1H032F08FM1 (forward), we must have the BES MRRE1H032F08RM1 (reverse) just after, like:\n\ + 554\t26\t0\t0\t1\t16\t1\t17\t+\tMRRE1H032F08FM1\t606\t10\t606\tchr11\t19818926\t3725876\t3726473\t2\t553,27,\t10,579,\t3725876,3726446,\n\ + 620\t23\t0\t0\t0\t0\t0\t0\t-\tMRRE1H032F08RM1\t643\t0\t643\tchr11\t19818926\t3794984\t3795627\t1\t643,\t0,\t3794984,\n\ + Note 2: the header in Blat results output file must be present (5 lines).\n\n' + + parser = optparse.OptionParser(usage= help, version="CovertSamToFastq.py v1.0") + parser.add_option( '-i', '--input', dest='inputBLAT', help='Blat Input File Name, with BES paired (1 Forward and 1 Reverse) [Format: tabular]', default= None ) + parser.add_option( '-f', '--fasta', dest='inputFASTA', help='Fasta Input File Name, with all sequences of BES [Format: fasta]', default= None ) + parser.add_option( '-o', '--output', dest='output', help='Output File Name [Format: GFF3]', default= None ) + parser.add_option( '-n', '--methodname', dest='methodName', help='Method name in col. 
# Methods of the BlatToGffForBesPaired class (the class header and
# setAttributesFromCmdLine lie before this chunk).

def checkOptions(self):
    """Validate the parsed command-line options and copy them onto the instance.

    Raises:
        Exception: if -i or -f is missing or names a file that does not
            exist, or if -o is missing.
    """
    # The options default to None, so test for "empty" rather than == '':
    # the old comparison let None reach os.path.exists(), which does not
    # produce the intended error message.
    if not self._options.inputBLAT:
        raise Exception("ERROR: No Blat file specified for -i !")
    if not os.path.exists(self._options.inputBLAT):
        raise Exception("ERROR: Blat Input File doesn't exist !")
    self._inputFileBlat = self._options.inputBLAT

    if not self._options.inputFASTA:
        raise Exception("ERROR: No Fasta file specified for -f !")
    if not os.path.exists(self._options.inputFASTA):
        raise Exception("ERROR: Fasta Input File doesn't exist !")
    self._inputFileFasta = self._options.inputFASTA

    if not self._options.output:
        raise Exception("ERROR: No Output file specified for -o !")
    self._outputFileGFF = self._options.output

    self._methodName = self._options.methodName

def run(self):
    """Convert the Blat file to GFF3, reading the hits two by two.

    The first five lines of the Blat file are skipped as header.  Every
    pair of consecutive hits gives two BES lines and one BAC line; the
    three lines are written only when the BAC line could be built.
    """
    self.checkOptions()
    self._createGFFOutputFile()

    with open(self._inputFileBlat, 'r') as blatFile:
        for _ in range(5):
            blatFile.readline()
        numberLine = 6
        blatLine = blatFile.readline()
        while blatLine != '':
            gffLineBes1, besName1, seqBes1, typeBes1 = self.convertBlatObjectToGffLine(blatLine, numberLine)

            blatLine = blatFile.readline()
            numberLine += 1
            if blatLine == '':
                # A last hit without partner cannot form a BAC: report it
                # instead of parsing an empty string.
                sys.stderr.write("WARNING: At line %s, the BAC-Ends (%s) has no paired BAC-Ends on the next line !!\n" % (numberLine - 1, besName1))
                break

            gffLineBes2, besName2, seqBes2, typeBes2 = self.convertBlatObjectToGffLine(blatLine, numberLine)
            gffLineBac = self.createGffLineForBac(gffLineBes1, besName1, seqBes1, typeBes1, gffLineBes2, besName2, seqBes2, typeBes2, numberLine)

            if gffLineBac is not None:
                self._printGFFLinesToOutputFile([gffLineBes1, gffLineBes2, gffLineBac])

            blatLine = blatFile.readline()
            numberLine += 1

def convertBlatObjectToGffLine(self, blatLine, numberLine):
    """Build the GFF3 line of one BES from one Blat hit line.

    Returns:
        (gffLine, besName, seqBes, typeBes) where typeBes is 'FM', 'RM' or
        '' depending on the suffix of the BES name.
    """
    iBlatHit = BlatParser()
    iBlatHit.setAttributesFromString(blatLine, numberLine)
    besName = iBlatHit.getQName()
    seqBes = self.extractBesSequenceFromFastaFile(besName, numberLine)

    typeBes = ''
    if re.match('^.+FM[0-9]$', besName):
        typeBes = 'FM'
    elif re.match('^.+RM[0-9]$', besName):
        typeBes = 'RM'

    if not self._methodName:
        featureType = 'BES'
    else:
        featureType = '%s:BES' % self._methodName
    tStart = iBlatHit.getTStart()
    tEnd = iBlatHit.getTEnd()
    attributes = 'ID=%s;Name=%s;bes_start=%s;bes_end=%s;bes_size=%s;muscadine_seq=%s' % (besName, besName, tStart, tEnd, iBlatHit.getTSize(), seqBes)

    gffLine = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (iBlatHit.getTName(), 'BlatToGffForBesPaired', featureType, tStart, tEnd, '.', '+', '.', attributes)
    return gffLine, besName, seqBes, typeBes

def createGffLineForBac(self, gffLineBes1, besName1, seqBes1, typeBes1, gffLineBes2, besName2, seqBes2, typeBes2, numberLine):
    """Build the GFF3 line of the BAC delimited by two paired BES lines.

    Returns:
        The GFF3 line, or None when the two BES do not share a BAC name,
        do not lie apart on the same sequence, or are not one FM and one RM.
    """
    lGffLineBes1 = gffLineBes1.split('\t')
    lGffLineBes2 = gffLineBes2.split('\t')
    # As before, the names are re-read from the ID attribute of each line.
    besName1 = self.getBesName(lGffLineBes1[8])
    besName2 = self.getBesName(lGffLineBes2[8])

    tBes1 = (lGffLineBes1[0], int(lGffLineBes1[3]), int(lGffLineBes1[4]))
    tBes2 = (lGffLineBes2[0], int(lGffLineBes2[3]), int(lGffLineBes2[4]))

    if not (self.checkBesNames(besName1, besName2, numberLine) and self.checkBesPositions(tBes1, tBes2)):
        return None

    startBacPos, endBacPos = self.getBacPositions(tBes1, tBes2)
    sizeBacPos = endBacPos - startBacPos + 1
    bacName = self.getBacName(besName1)
    fmAndRm = self.getBesFmAndRmNamesAndSequences(besName1, seqBes1, typeBes1, besName2, seqBes2, typeBes2)
    if fmAndRm is None:
        # Unpacking None used to abort the whole conversion here.
        sys.stderr.write("WARNING: Lines %s and %s the two Bes (%s AND %s) are not one forward (FM) and one reverse (RM) !!!\n" % (int(numberLine) - 1, numberLine, besName1, besName2))
        return None
    nameBesFM, seqBesFM, nameBesRM, seqBesRM = fmAndRm

    if not self._methodName:
        featureType = 'BAC'
    else:
        featureType = '%s:BAC' % self._methodName
    attributes = 'ID=%s;Name=%s;bac_start=%s;bac_end=%s;bac_size=%s;besFM_name=%s;muscadine_besFM_seq=%s;besRM_name=%s;muscadine_besRM_seq=%s' % (bacName, bacName, startBacPos, endBacPos, sizeBacPos, nameBesFM, seqBesFM, nameBesRM, seqBesRM)
    return '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (lGffLineBes1[0], 'BlatToGffForBesPaired', featureType, startBacPos, endBacPos, '.', '.', '.', attributes)

def getBesFmAndRmNamesAndSequences(self, besName1, seqBes1, typeBes1, besName2, seqBes2, typeBes2):
    """Order a BES pair as (FM name, FM sequence, RM name, RM sequence).

    Returns None when the pair is not exactly one 'FM' and one 'RM'.
    """
    if typeBes1 == 'FM' and typeBes2 == 'RM':
        return besName1, seqBes1, besName2, seqBes2
    if typeBes1 == 'RM' and typeBes2 == 'FM':
        return besName2, seqBes2, besName1, seqBes1
    return None

def getBesName(self, col9):
    """Return the BES name stored in the leading 'ID=' attribute of a GFF column 9."""
    return col9.split(';')[0][3:]

def getBacName(self, besName):
    """Return the BAC name, i.e. the BES name without its last three characters."""
    return besName[:-3]

def checkBesNames(self, besName1, besName2, line):
    """Return True when both BES names share the same BAC prefix; warn otherwise."""
    if besName1[:-3] == besName2[:-3]:
        return True
    sys.stderr.write("WARNING: Lines %s and %s the two Bes (%s AND %s) do not belong to the same BAC !!!\n -> you have to filter this Blat file...\n" % (int(line)-1, line, besName1, besName2))
    return False

def checkBesPositions(self, tBes1, tBes2):
    """Return True when both (name, start, end) BES lie on the same sequence without overlapping."""
    if tBes1[0] != tBes2[0]:
        return False
    minBes1, maxBes1 = min(tBes1[1], tBes1[2]), max(tBes1[1], tBes1[2])
    minBes2, maxBes2 = min(tBes2[1], tBes2[2]), max(tBes2[1], tBes2[2])
    # max < other min already implies min < other min.
    return maxBes1 < minBes2 or maxBes2 < minBes1

def getBacPositions(self, tBes1, tBes2):
    """Return (start, end) of the BAC: from the leftmost BES start to the other BES end."""
    minBes1, maxBes1 = min(tBes1[1], tBes1[2]), max(tBes1[1], tBes1[2])
    minBes2, maxBes2 = min(tBes2[1], tBes2[2]), max(tBes2[1], tBes2[2])
    if minBes1 < minBes2:
        return minBes1, maxBes2
    return minBes2, maxBes1

def extractBesSequenceFromFastaFile(self, besName, numberLine):
    """Return the sequence of besName from the input fasta file, or 'NA'.

    The record is extracted through FastaUtils.dbExtractByPattern into a
    temporary fasta file (presumably created by that call -- confirm in
    FastaUtils), which is read back and deleted.
    """
    stamp = datetime.datetime.now().strftime("%d%m%Y_%H%M%S")
    tmpFileName = 'tmp_BlatToGffForBesPaired_%s.fasta' % stamp
    iFastaUtils = FastaUtils()
    iFastaUtils.dbExtractByPattern(besName, self._inputFileFasta, tmpFileName)

    if os.path.exists(tmpFileName):
        # Close the handle before deleting, including when the file is empty.
        with open(tmpFileName, 'r') as tmpFasta:
            fastaLines = tmpFasta.readlines()
        os.remove(tmpFileName)
        if fastaLines:
            return ''.join(fastaLine.replace('\n', '') for fastaLine in fastaLines if fastaLine[0] != '>')

    sys.stderr.write("WARNING: At line %s, the BAC-Ends (%s) hasn't got sequence in fasta file (%s) !!\n" % (numberLine, besName, os.path.basename(self._inputFileFasta)))
    return 'NA'

def _createGFFOutputFile(self):
    """Create (or truncate) the output file and write the GFF3 version pragma."""
    with open(self._outputFileGFF, 'w') as gffFile:
        gffFile.write("##gff-version 3\n")

def _printGFFLinesToOutputFile(self, lLines):
    """Append the given GFF lines to the output file."""
    with open(self._outputFileGFF, 'a') as gffFile:
        gffFile.writelines(lLines)

if __name__ == '__main__':
    iBlatToGffForBesPaired = BlatToGffForBesPaired()
    iBlatToGffForBesPaired.setAttributesFromCmdLine()
    iBlatToGffForBesPaired.run()
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class BowtieParser(MapperParser):
    """Mapper parser for tab-separated Bowtie alignment lines."""

    def __init__(self, fileName, verbosity = 0):
        super(BowtieParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(BowtieParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["bowtie"]


    def skipFirstLines(self):
        """Skip nothing: parsing starts at the first line of the file."""
        pass


    def parseLine(self, line):
        """Turn one Bowtie line into a Mapping holding a single SubMapping.

        The line must have 7 or 8 tab-separated fields; the optional 8th
        field is a comma-separated list whose length is taken as the
        number of mismatches.  Raises Exception on any other field count.
        """
        line = line.strip()
        columns = line.split("\t")
        nbColumns = len(columns)
        if nbColumns != 7 and nbColumns != 8:
            raise Exception("Line %d '%s' does not look like a BowTie line (number of fields is %d instead of 7 or 8)" % (self.currentLineNb, line, nbColumns))

        readName, strand, chromosome, offset, sequence = columns[:5]
        if strand == "+":
            direction = 1
        else:
            direction = -1
        # The offset is shifted by one to give the target start.
        genomeStart = int(offset) + 1
        # Column 7 is not used further, but must still be an integer.
        int(columns[6])
        readLength = len(sequence)
        if nbColumns == 8:
            nbMismatches = len(columns[7].split(","))
        else:
            nbMismatches = 0

        mapping = Mapping()

        queryInterval = Interval()
        queryInterval.setName(readName)
        queryInterval.setStart(1)
        # NOTE(review): start 1 / end length + 1 spans one position more
        # than the target interval below -- confirm this is intended.
        queryInterval.setEnd(readLength + 1)

        targetInterval = Interval()
        targetInterval.setChromosome(chromosome)
        targetInterval.setStart(genomeStart)
        targetInterval.setEnd(genomeStart + readLength - 1)

        subMapping = SubMapping()
        subMapping.setQueryInterval(queryInterval)
        subMapping.setTargetInterval(targetInterval)

        mapping.addSubMapping(subMapping)
        mapping.setSize(readLength)
        mapping.setNbMismatches(nbMismatches)
        mapping.setDirection(direction)
        return mapping
-0400 @@ -0,0 +1,137 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class CoordsParser(MapperParser):
    """A class that parses the .coords output of Nucmer

    Four line layouts are recognised (self.lineType):
      1 -- boxed layout with '|' separators (default),
      2 -- tabular layout with [LEN R] [LEN Q] [COV R] [COV Q] columns,
      3 -- boxed layout with [% SIM] and [% STP] columns,
      4 -- tabular layout with both the [% SIM]/[% STP] and length/coverage columns.
    """

    # Header of layout 3, with every run of blanks reduced to one space.
    _BOXED_SIM_HEADER = "[S1] [E1] | [S2] [E2] | [LEN 1] [LEN 2] | [% IDY] [% SIM] [% STP] | [FRM] [TAGS]"

    def __init__(self, fileName, verbosity = 0):
        # The group names had been lost ("(?P\d+)" does not compile) while
        # parseLine reads the groups by name.  They are restored following
        # the column order announced in the headers tested by
        # skipFirstLines, taking [S1]/[E1]/[LEN 1] and the first tag as the
        # target (reference) and [S2]/[E2]/[LEN 2] and the second tag as
        # the query.
        # NOTE(review): the names that parseLine does not read (rlen, rcov,
        # similarity, stop, rframe, qframe) are descriptive choices.
        self._lineParseRe = re.compile(
            r"^\s*(?P<tStart>\d+)\s+(?P<tEnd>\d+)"
            r"\s+\|\s+(?P<qStart>\d+)\s+(?P<qEnd>\d+)"
            r"\s+\|\s+(?P<tLength>\d+)\s+(?P<qLength>\d+)"
            r"\s+\|\s+(?P<identity>\d+\.?\d*)"
            r"\s+\|\s+(?P<tName>[\w\|\:\-]+)\s+(?P<qName>.*)\s*$")
        self._lineParseRe2 = re.compile(
            r"^\s*(?P<tStart>\d+)\s+(?P<tEnd>\d+)"
            r"\s+(?P<qStart>\d+)\s+(?P<qEnd>\d+)"
            r"\s+(?P<tLength>\d+)\s+(?P<qLength>\d+)"
            r"\s+(?P<identity>\d+\.?\d*)"
            r"\s+(?P<rlen>\d+\.?\d*)\s+(?P<qlen>\d+\.?\d*)"
            r"\s+(?P<rcov>\d+\.?\d*)\s+(?P<qcov>\d+\.?\d*)"
            r"\s+(?P<rframe>[-]?\d+\.?\d*)\s+(?P<qframe>[-]?\d+\.?\d*)"
            r"\s+(?P<tName>[\w\|\:\-]+)\s+(?P<qName>.*)\s*$")
        self._lineParseRe3 = re.compile(
            r"^\s*(?P<tStart>\d+)\s+(?P<tEnd>\d+)"
            r"\s+\|\s+(?P<qStart>\d+)\s+(?P<qEnd>\d+)"
            r"\s+\|\s+(?P<tLength>\d+)\s+(?P<qLength>\d+)"
            r"\s+\|\s+(?P<identity>\d+\.?\d*)\s+(?P<similarity>\d+\.?\d*)\s+(?P<stop>\d+\.?\d*)"
            r"\s+\|\s+(?P<rframe>[-]?\d+\.?\d*)\s+(?P<qframe>[-]?\d+\.?\d*)"
            r"\s+(?P<tName>[\w\|\:\-]+)\s+(?P<qName>.*)\s*$")
        self._lineParseRe4 = re.compile(
            r"^\s*(?P<tStart>\d+)\s+(?P<tEnd>\d+)"
            r"\s+(?P<qStart>\d+)\s+(?P<qEnd>\d+)"
            r"\s+(?P<tLength>\d+)\s+(?P<qLength>\d+)"
            r"\s+(?P<identity>\d+\.?\d*)\s+(?P<similarity>\d+\.?\d*)\s+(?P<stop>\d+\.?\d*)"
            r"\s+(?P<rlen>\d+\.?\d*)\s+(?P<qlen>\d+\.?\d*)"
            r"\s+(?P<rcov>\d+\.?\d*)\s+(?P<qcov>\d+\.?\d*)"
            r"\s+(?P<rframe>[-]?\d+\.?\d*)\s+(?P<qframe>[-]?\d+\.?\d*)"
            r"\s+(?P<tName>[\w\|\:\-]+)\s+(?P<qName>.*)\s*$")
        self.lineType = 1
        MapperParser.__init__(self, fileName, verbosity)

    def getFileFormats():
        return ["coords"]
    getFileFormats = staticmethod(getFileFormats)

    def skipFirstLines(self):
        """Read up to the end of the header and detect the line layout.

        Reading stops at end of file, at a '=====' separator line or at a
        tabular column header; self.lineType is set from the header seen.
        """
        while True:
            line = self.handle.readline()
            self.currentLineNb += 1
            if line == "":
                break
            if "=====" in line:
                break
            if "[S1]\t[E1]\t[S2]\t[E2]\t[LEN 1]\t[LEN 2]\t[% IDY]\t[LEN R]\t[LEN Q]\t[COV R]\t[COV Q]\t[FRM]\t[TAGS]" in line:
                self.lineType = 2
                break
            # Compared after collapsing blanks, so that the padding used to
            # align the boxed columns does not matter.  No break: reading
            # goes on up to the '=====' line.
            if self._BOXED_SIM_HEADER in " ".join(line.split()):
                self.lineType = 3

            if "[% IDY]\t[% SIM]\t[% STP]" in line and "[LEN Q]" in line:
                self.lineType = 4
                break

    def parseLine(self, line):
        """Turn one alignment line into a Mapping holding one SubMapping.

        Exits the program (sys.exit) when the line does not match the
        layout detected by skipFirstLines.
        """
        if self.lineType == 1:
            m = self._lineParseRe.search(line)
        elif self.lineType == 2:
            m = self._lineParseRe2.search(line)
        elif self.lineType == 3:
            m = self._lineParseRe3.search(line)
        elif self.lineType == 4:
            m = self._lineParseRe4.search(line)
        else:
            # Unknown layout: report it like any unparsable line.
            m = None
        if m is None:
            sys.exit("\nLine %d '%s' does not have a NucMer format" % (self.currentLineNb, line))

        # Convert every field once.
        qName = m.group("qName")
        tName = m.group("tName")
        qStart = int(m.group("qStart"))
        qEnd = int(m.group("qEnd"))
        tStart = int(m.group("tStart"))
        tEnd = int(m.group("tEnd"))
        qLength = int(m.group("qLength"))
        tLength = int(m.group("tLength"))
        identity = float(m.group("identity"))
        # Start/end are stored sorted; the sign of end - start is passed on
        # as the direction.
        queryStart, queryEnd = min(qStart, qEnd), max(qStart, qEnd)
        targetStart, targetEnd = min(tStart, tEnd), max(tStart, tEnd)
        size = min(qLength, tLength)

        mapping = Mapping()

        subMapping = SubMapping()
        subMapping.queryInterval.setName(qName)
        subMapping.queryInterval.setStart(queryStart)
        subMapping.queryInterval.setEnd(queryEnd)
        subMapping.queryInterval.setSize(qLength)
        subMapping.queryInterval.setDirection(qEnd - qStart)

        subMapping.targetInterval.setChromosome(tName)
        subMapping.targetInterval.setStart(targetStart)
        subMapping.targetInterval.setEnd(targetEnd)
        subMapping.targetInterval.setSize(tLength)
        subMapping.targetInterval.setDirection(tEnd - tStart)

        subMapping.setDirection(qEnd - qStart)
        subMapping.setSize(size)
        subMapping.setIdentity(identity)

        mapping.addSubMapping(subMapping)
        mapping.targetInterval.setStart(targetStart)
        mapping.targetInterval.setEnd(targetEnd)
        mapping.targetInterval.setSize(tLength)
        mapping.targetInterval.setChromosome(tName)

        mapping.queryInterval.setStart(queryStart)
        mapping.queryInterval.setEnd(queryEnd)
        mapping.queryInterval.setSize(qLength)
        mapping.queryInterval.setName(qName)
        mapping.setDirection(qEnd - qStart)
        mapping.setSize(size)
        mapping.setIdentity(identity)
        mapping.setTagValue("feature", "match")
        mapping.setTagValue("Target", "%s %d %d" % (qName, qStart, qEnd))

        # Only the tabular layouts carry the coverage and length columns.
        if self.lineType == 2 or self.lineType == 4:
            mapping.setTagValue("target_pident", identity)
            mapping.setTagValue("target_pcover", float(m.group("qcov")))
            mapping.setTagValue("target_length", int(m.group("qlen")))

        # A mismatch count based on the Hamming distance of the two lines
        # following each hit used to be sketched here; it stayed disabled
        # because of a possible slowdown.
        mapping.setNbGaps(0)

        return mapping
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class CrossSsrAndBesMappedByBlatToGff(object):
    """Write, as GFF3, the SSRs of the BES that have a hit in a Blat result file."""

    def __init__(self):
        self._inputFileSSR = ''
        self._inputFileBlat = ''
        self._outputFileGFF = ''
        # Set by checkOptions; initialised so the attribute always exists.
        self._methodName = None

    def setAttributesFromCmdLine(self):
        """Parse the command line and keep the options in self.options."""
        usage = (' \nThis Script Launch CrossSsrAndBesMappedByBlatToGff.\n\n'
                 ' Example 1: python CrossSsrAndBesMappedByBlatToGff.py -s ssrResultsFile.tab -b blatResultsFile.tab -o outputFile.gff3\n'
                 ' Example 2: python CrossSsrAndBesMappedByBlatToGff.py -s ssrResultsFile.tab -b blatResultsFile.tab -o outputFile.gff3 -n muscadine:filtre1\n\n')

        # NOTE(review): the version string names another script
        # (CovertSamToFastq.py); kept unchanged here.
        parser = optparse.OptionParser(usage=usage, version="CovertSamToFastq.py v1.0")
        parser.add_option('-s', '--ssr', dest='inputSSR', help='SSR Input File Name [Format: tabular]', default=None)
        parser.add_option('-b', '--blat', dest='inputBLAT', help='Blat Input File Name [Format: tabular]', default=None)
        parser.add_option('-o', '--output', dest='output', help='Output File Name [Format: GFF3]', default=None)
        parser.add_option('-n', '--methodName', dest='methodName', help='Method name in col. 3 [Default: None]', default=None)
        (options, args) = parser.parse_args()
        self.options = options

    def checkOptions(self):
        """Validate the parsed options and copy them onto the instance.

        Raises:
            Exception: if -s or -b is missing or names a file that does
                not exist, or if -o is missing.
        """
        # The options default to None, so test for "empty" rather than
        # == '' (None used to slip through to os.path.exists()).
        if not self.options.inputSSR:
            raise Exception("ERROR: No SSR file specified for -s !")
        if not os.path.exists(self.options.inputSSR):
            raise Exception("ERROR: SSR Input File doesn't exist !")
        self._inputFileSSR = self.options.inputSSR

        if not self.options.inputBLAT:
            raise Exception("ERROR: No Blat file specified for -b !")
        if not os.path.exists(self.options.inputBLAT):
            raise Exception("ERROR: Blat Input File doesn't exist !")
        self._inputFileBlat = self.options.inputBLAT

        if not self.options.output:
            raise Exception("ERROR: No Output file specified for -o !")
        self._outputFileGFF = self.options.output

        self._methodName = self.options.methodName

    def run(self):
        """Write one GFF3 line per SSR of every Blat hit whose BES has SSRs.

        The first five lines of the Blat file are skipped as header and
        reading stops at end of file or at the first empty line.
        """
        self.checkOptions()
        self._createGFFOutputFile()

        dictSsrParser = self.createDictOfSsrParser({})

        with open(self._inputFileBlat, 'r') as blatFile:
            for _ in range(5):
                blatFile.readline()
            numberLine = 6
            blatLine = blatFile.readline()
            while blatLine != '' and blatLine != '\n':
                thisBlatHit = BlatParser()
                thisBlatHit.setAttributesFromString(blatLine, numberLine)

                if thisBlatHit.getQName() in dictSsrParser:
                    lLinesToPrint = self.createListOfGFFLinesForThisBesWithSSR(thisBlatHit, dictSsrParser)
                    self._printGFFLinesToOutputFile(lLinesToPrint)

                blatLine = blatFile.readline()
                numberLine += 1

    def createDictOfSsrParser(self, dictSsrParser):
        """Return a new dict mapping each BES name to the list of its SSR hits.

        The dictSsrParser argument is ignored (a fresh dict is always
        built); it is only kept for compatibility.  The first line of the
        SSR file is skipped as header and reading stops at end of file or
        at the first empty line.
        """
        dictSsrParser = {}
        with open(self._inputFileSSR, 'r') as ssrFile:
            ssrFile.readline()
            numberLine = 2
            line = ssrFile.readline()
            while line != '' and line != '\n':
                thisSSRHit = SsrParser()
                thisSSRHit.setAttributesFromString(line, numberLine)
                dictSsrParser.setdefault(thisSSRHit.getBesName(), []).append(thisSSRHit)

                line = ssrFile.readline()
                numberLine += 1
        return dictSsrParser

    def createListOfGFFLinesForThisBesWithSSR(self, BlatHitObject, dictSsrParser):
        """Return the GFF3 lines of every SSR carried by the BES of one Blat hit."""
        listGffLines = []

        besNameToKeep = BlatHitObject.getQName()
        if self._methodName:
            featureType = '%s:SSR' % self._methodName
        else:
            featureType = 'SSR'

        for SSRHitObject in dictSsrParser[besNameToKeep]:
            posSSRStart = self.convertSSRPositionsToChromPositions(SSRHitObject.getSsrStart(), BlatHitObject.getTStart(), BlatHitObject.getTEnd(), BlatHitObject.getStrand())
            posSSREnd = self.convertSSRPositionsToChromPositions(SSRHitObject.getSsrEnd(), BlatHitObject.getTStart(), BlatHitObject.getTEnd(), BlatHitObject.getStrand())
            ssrSeq = self.getSsrSeq(SSRHitObject.getSsrMotif(), SSRHitObject.getSsrMotifNumber())
            besRedundancy = SSRHitObject.getBesRedundancy()

            attributes = 'ID=SSR_%s_%s;Name=SSR_%s_%s;bes_name=%s;bes_size=%s;bes_matchstart=%s;bes_matchend=%s;bes_redundancy=%s;ssr_type=%s;ssr_motif=%s;ssr_motif_number=%s;ssr_start=%s;ssr_end=%s;muscadine_seq=%s' % (
                besNameToKeep, besRedundancy,
                besNameToKeep, besRedundancy,
                besNameToKeep, BlatHitObject.getQSize(),
                BlatHitObject.getQStart(), BlatHitObject.getQEnd(),
                besRedundancy, SSRHitObject.getSsrNbNucleotides(),
                SSRHitObject.getSsrMotif(), SSRHitObject.getSsrMotifNumber(),
                SSRHitObject.getSsrStart(), SSRHitObject.getSsrEnd(), ssrSeq)
            gffLine = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (BlatHitObject.getTName(), 'CrossSsrAndBesAlignedByBlat', featureType, posSSRStart, posSSREnd, '.', BlatHitObject.getStrand(), '.', attributes)
            listGffLines.append(gffLine)

        return listGffLines

    def convertSSRPositionsToChromPositions(self, ssrPos, chromPosStart, chromPosEnd, strand):
        """Convert a 1-based position inside the BES to a position on the target.

        On '+' the position is counted from chromPosStart, on '-' backwards
        from chromPosEnd.

        Raises:
            ValueError: if strand is neither '+' nor '-' (this used to
                fail on an unassigned local variable).
        """
        if strand == '+':
            return int(chromPosStart) + int(ssrPos) - 1
        if strand == '-':
            return int(chromPosEnd) - int(ssrPos) + 1
        raise ValueError("ERROR: unknown strand '%s' (expected '+' or '-') !" % strand)

    def getSsrSeq(self, motif, nbMotif):
        """Return the motif repeated nbMotif times."""
        return motif * int(nbMotif)

    def _createGFFOutputFile(self):
        """Create (or truncate) the output file and write the GFF3 version pragma."""
        with open(self._outputFileGFF, 'w') as gffFile:
            gffFile.write("##gff-version 3\n")

    def _printGFFLinesToOutputFile(self, lLinesToPrint):
        """Append the given GFF lines to the output file."""
        with open(self._outputFileGFF, 'a') as gffFile:
            gffFile.writelines(lLinesToPrint)

if __name__ == '__main__':
    iCrossSsrAndBesMappedByBlatToGff = CrossSsrAndBesMappedByBlatToGff()
    iCrossSsrAndBesMappedByBlatToGff.setAttributesFromCmdLine()
    iCrossSsrAndBesMappedByBlatToGff.run()
software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class ElandParser(MapperParser):
    """A class that parses ELAND format"""

    def __init__(self, fileName, verbosity = 0):
        super(ElandParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(ElandParser, self).__del__()


    def getFileFormats():
        return ["eland"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        """Skip nothing: parsing starts at the first line of the file."""
        pass


    def getInfos(self):
        super(ElandParser, self).getInfos()


    def parseLine(self, line):
        """Turn one ELAND line into a Mapping.

        Returns:
            The Mapping, or None when the read has no position or when the
            filtering field is not "Y".

        Exits the program (sys.exit) when the line has fewer than 22
        tab-separated fields or when the read number is not "1".
        """
        # This file imports "Mapping" from the structure package, which
        # gives the module; the class lives in structure.Mapping, as
        # imported by the sibling parsers.  Import the class itself so the
        # call below works.
        from SMART.Java.Python.structure.Mapping import Mapping as MappingClass

        line = line.strip()

        fields = line.split("\t")
        if len(fields) < 22:
            sys.exit("Line %d '%s' does not look like a ELAND line (number of fields is %d instead of 22)" % (self.currentLineNb, line, len(fields)))

        # Only the fields used below are named; the others (index, contig,
        # pair score, partner fields) are not read.
        flowCell, run, lane, tile, xcoord, ycoord = fields[0:6]
        number, read, quality, chromosome = fields[7:11]
        position, strand, description, singleScore = fields[12:16]
        filtering = fields[21]

        if number != "1":
            sys.exit("S-MART cannot handle pair-end reads yet!")

        # nothing found
        if position == "":
            return None

        name = "%s_%s:%s:%s:%s:%s#0/1" % (flowCell, run, lane, tile, xcoord, ycoord)
        direction = 1 if strand == "F" else -1
        # Every upper-case letter of the description counts as one mismatch.
        nbMismatches = sum(1 for char in description if "A" <= char <= "Z")

        mapping = MappingClass()
        mapping.setTagValue("qualityString", quality)

        mapping.queryInterval.setName(name)
        mapping.queryInterval.setDirection(direction)
        mapping.queryInterval.setStart(1)
        mapping.queryInterval.setEnd(len(read))

        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(int(position))
        # NOTE(review): position + length spans one position more than the
        # query interval (1 .. length) -- confirm this is intended.
        mapping.targetInterval.setEnd(int(position) + len(read))
        mapping.targetInterval.setDirection(1)

        mapping.setSize(len(read))
        mapping.setDirection(direction)

        mapping.setNbGaps(0)
        mapping.setNbMismatches(nbMismatches)
        mapping.setTagValue("score", int(singleScore))

        if filtering == "Y":
            return mapping
        # mapping filtered out
        return None
class ExoParser(MapperParser):
    """A class that parses the output of Exonerate - roll your own format"""

    # One alignment line: query name/start/end/strand, target
    # name/start/end/strand, two counters, then the operation triplets.
    _lineRe = re.compile(r"^\s*(\S+)\s+(\d+)\s+(\d+)\s+[+-]\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+\d+\s+(\d+)\s+(\S.*)$")
    # One "<operation> <query length> <target length>" triplet.
    _operationRe = re.compile(r"^(\w)\s+(\d+)\s+(\d+)")

    def __init__(self, fileName, verbosity = 0):
        super(ExoParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(ExoParser, self).__del__()


    def getFileFormats():
        return ["exo", "exonerate"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        """Read the header up to, and including, the line containing "Hostname"."""
        while True:
            line = self.handle.readline()
            # readline() keeps returning "" at end of file: stop there
            # instead of looping forever when no "Hostname" line exists.
            if line == "" or "Hostname" in line:
                break
            self.currentLineNb += 1


    def parseLine(self, line):
        """Turn one alignment line into a Mapping with one SubMapping per block.

        Returns None for the closing "-- completed exonerate analysis" line.
        Exits the program (sys.exit) on a line that does not match the
        expected format or on an unknown operation letter.
        """
        if line == "-- completed exonerate analysis\n":
            return None

        m = self._lineRe.search(line)
        if m is None:
            sys.exit("\nLine %d '%s' does not have a RYO format" % (self.currentLineNb, line))

        mapping = Mapping()
        name = m.group(1)
        queryStart = min(int(m.group(2)), int(m.group(3)))
        queryEnd = max(int(m.group(2)), int(m.group(3)))-1
        chromosome = m.group(4)
        targetStart = min(int(m.group(5)), int(m.group(6)))
        targetEnd = max(int(m.group(5)), int(m.group(6)))-1
        direction = m.group(7)
        nbMismatches = int(m.group(8))
        rest = m.group(9).strip()

        nbGaps = 0
        queryOffset = 0
        targetOffset = 0

        subMapping = None
        m = self._operationRe.search(rest)
        while m is not None:
            operation = m.group(1)
            queryDistance = int(m.group(2))
            targetDistance = int(m.group(3))
            if operation == "M":
                # A match opens a new block unless one is already open.
                # NOTE(review): the block is taken to be initialised only
                # when it is created -- confirm against the upstream file.
                if subMapping is None:
                    subMapping = SubMapping()

                    subMapping.setSize(queryDistance)
                    subMapping.setDirection(direction)

                    subMapping.queryInterval.setName(name)
                    subMapping.queryInterval.setStart(queryStart + queryOffset)
                    subMapping.queryInterval.setDirection(direction)

                    subMapping.targetInterval.setChromosome(chromosome)
                    subMapping.targetInterval.setStart(targetStart + targetOffset)
                    subMapping.targetInterval.setDirection(1)

            elif operation == "G":
                nbGaps += max(queryDistance, targetDistance)

            elif operation == "I" or operation == "5" or operation == "3":
                # These operations close the block that is open, if any.
                if subMapping is not None:
                    subMapping.queryInterval.setEnd(queryStart + queryOffset - 1)
                    subMapping.targetInterval.setEnd(targetStart + targetOffset - 1)
                    mapping.addSubMapping(subMapping)
                    subMapping = None
            else:
                sys.exit("Cannot understand sign '%s' in line %s" % (operation, line))

            queryOffset += queryDistance
            targetOffset += targetDistance
            rest = rest[m.end():].strip()
            m = self._operationRe.search(rest)

        # Close the block still open at the end of the line.
        if subMapping is not None:
            subMapping.queryInterval.setEnd(queryStart + queryOffset - 1)
            subMapping.targetInterval.setEnd(targetStart + targetOffset - 1)
            mapping.addSubMapping(subMapping)

        mapping.setNbMismatches(nbMismatches)
        mapping.setNbGaps(nbGaps)
        mapping.setDirection(direction)

        mapping.queryInterval.setName(name)
        mapping.queryInterval.setStart(queryStart)
        mapping.queryInterval.setEnd(queryEnd)

        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(targetStart)
        mapping.targetInterval.setEnd(targetEnd)

        return mapping
+# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import sys +from commons.core.parsing.SequenceListParser import SequenceListParser +from SMART.Java.Python.structure.Sequence import Sequence +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress + +class FastaParser(SequenceListParser): + """A class that reads a list of sequences in FASTA""" + + def __init__(self, fileName, verbosity = 0): + super(FastaParser, self).__init__(fileName, verbosity) + self.tags = {} + + + def getTags(self): + return self.tags + + + def getFileFormats(): + return ["fasta", "mfa", "fas"] + getFileFormats = staticmethod(getFileFormats) + + + def getInfos(self): + """ + Get some generic information about the sequences + """ + self.nbSequences = 0 + self.size = 0 + self.reset() + progress = UnlimitedProgress(100000, "Reading input file", self.verbosity - 9) + for line in self.handle: + line = line.strip() + if line == "": + continue + if line[0] == ">": + self.nbSequences += 1 + else: + self.size += len(line) + progress.inc() + progress.done() + self.reset() + + + def parseOne(self): + """ + Parse only one element in the file + """ + name = None + string = "" + + if self.currentLine != None: + if self.currentLine[0] != ">": + raise Exception("First line is weird: %s" % (self.currentLine)) + name = self.currentLine[1:].split()[0].replace("|", "_").replace(".", "_") + self.currentLine = None + + for line in self.handle: + line = line.strip() + if line == "": + pass + elif line[0] == ">": + if name == None: + name = line[1:].split()[0].replace("|", "_").replace(".", "_") + else: + self.currentLine = line + return Sequence(name, string) + else: + string += line + + if name == None: + return None + return Sequence(name, string) + + + def setTags(self): + mark = self.handle.tell() + thisTag = mark + + line = self.handle.readline() + while line != "": + if line[0] == ">": + line = line.strip() + self.tags[line[1:].split()[0]] = thisTag + thisTag = self.handle.tell() + line = self.handle.readline() + + self.handle.seek(mark) 
+ + + def getSubSequence(self, chromosome, start, end, direction, name = None): + if not self.tags: + self.setTags() + + if chromosome not in self.tags: + raise Exception("Cannot find " + chromosome) + + if name == None: + name = "%s:%d-%d (%d)" % (chromosome, start, end, direction) + sequence = Sequence(name) + + # switch from 1-based to 0-based coordinates + start -= 1 + end -= 1 + + self.handle.seek(self.tags[chromosome]) + line = self.handle.readline().strip() + if line != ">" + chromosome: + raise Exception("Arrived in a wrong place (got %s)" % (line)) + + position1 = self.handle.tell() + line = self.handle.readline().strip() + position2 = self.handle.tell() + size = len(line) + address = position1 + ((start - (start % size)) / size) * (position2 - position1); + + count = max(0, start - (start % size)); + self.handle.seek(address) + + newSequence = "" + for line in self.handle: + line = line.strip() + + if line[0] == ">": + break + + subStart = start - count + if subStart < 0: + subStart = 0 + subEnd = end - count + subSize = subEnd - subStart + 1 + if subSize + subStart > len(line): + subSize = len(line) - subStart + if subEnd < 0: + break + if subStart <= len(line): + newSequence += line[subStart:subStart+subSize] + count += len(line) + + if newSequence == "": + raise Exception("Error, sequence %s is empty" % (name)) + sequence.sequence = newSequence + if direction == -1: + sequence.reverseComplement() + return sequence diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/FastqParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/FastqParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,104 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software.
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import sys +from commons.core.parsing.SequenceListParser import SequenceListParser +from SMART.Java.Python.structure.Sequence import Sequence + +class FastqParser(SequenceListParser): + """A class that reads a list of sequences in FASTQ format""" + + def __init__(self, fileName, verbosity = 0): + super(FastqParser, self).__init__(fileName, verbosity) + + + def getFileFormats(): + return ["fastq", "mfq"] + getFileFormats = staticmethod(getFileFormats) + + + def getInfos(self): + """ + Get some generic information about the sequences + """ + self.nbSequences = 0 + self.reset() + if self.verbosity >= 10: + print "Getting information on %s." 
% (self.fileName) + + nbLines = 0 + for line in self.handle: + line = line.strip() + if line == "": + continue + nbLines += 1 + if self.verbosity >= 10 and nbLines % 400000 == 0: + sys.stdout.write(" %d sequences read\r" % (nbLines / 4)) + sys.stdout.flush() + self.reset() + self.nbSequences = nbLines / 4 + if self.verbosity >= 10: + print " %d sequences read" % (self.nbSequences) + print "Done." + + + def parseOne(self): + """ + Parse only one element in the file + """ + string = "" + quality = "" + lineType = 0 + + for line in self.handle: + line = line.strip() + if lineType == 0: + if line[0] != "@": + raise Exception("Line '%s' should start with '@'!" % (line)) + name = line[1:] + inSequence = True + inQuality = False + elif lineType == 1: + string = line + elif lineType == 2: + if line[0] != "+": + sys.exit("Line '%s' should start with '+'!" % (line)) + if line[1:] != name and line != "+": + sys.exit("Weird difference in sequence and quality names (%s and %s) while parsing FASTQ file %s." 
% (name, line[1:], self.fileName)) + inQuality = True + inSequence = False + elif lineType == 3: + quality = line + lineType += 1 + if lineType == 4: + sequence = Sequence(name, string) + sequence.setQuality(quality) + return sequence + + return None diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/FindRep.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/FindRep.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,113 @@ +import re +from xml.sax.handler import ContentHandler + +class FindRep( ContentHandler ): + def __init__(self,outfileName, filter=0,count=0): + self.inWindowContent = 0 + self.inSeqNameContent = 0 + self.inStartContent = 0 + self.inEndContent = 0 + self.inPeriodContent = 0 + self.inUnitContent = 0 + self.inScoreContent = 0 + self.count = count + self._outfileName = outfileName + self.filter=filter + + def startDocument(self): + self._fileout = open(self._outfileName,"w") + + def startElement(self,name,attrs): + if name=="window": + self.inWindowContent=1 + elif name=="sequence-name": + self.inSeqNameContent=1 + self.seqname="" + elif name=="repeat": + self.inRepContent=1 + self.start="" + self.end="" + self.period="" + self.type={} + elif name=="start": + self.inStartContent=1 + elif name=="end": + self.inEndContent=1 + elif name=="period": + self.inPeriodContent=1 + elif name=="unit": + self.inUnitContent=1 + self.unit="" + elif name=="score": + self.inScoreContent=1 + self.score="" + + def characters(self,ch): + if self.inSeqNameContent: + self.seqname+=ch + elif self.inStartContent: + self.start+=ch + elif self.inEndContent: + self.end+=ch + elif self.inPeriodContent: + self.period+=ch + elif self.inUnitContent: + self.unit+=ch + elif self.inScoreContent: + self.score+=ch + + def endElement(self,name): + if name=="window": + self.inWindowContent=0 + elif name=="sequence-name": + self.inSeqNameContent=0 + elif name=="repeat": + self.inRepContent=0 + start=int(self.start) + end=int(self.end) + period=int(self.period) + 
score=float(self.score) + if score>self.filter: + return + max = 0 + self.count+=1 + for k,n in self.type.items(): + if n>max: + max = n + k_max = k + + m=re.match("^[0-9]+.+\{Cut\}",self.seqname) + if m!=None: + seqname=self.seqname[m.start(0):m.end(0)-5].rstrip() + seqname=re.sub("^[0-9]+ ","",seqname).lstrip() + tok=self.seqname[m.end(0):].split("..") + astart=start+int(tok[0])-1 + aend=end+int(tok[0])-1 + else: + astart=start + aend=end + seqname=self.seqname + if len(k_max) > 100: + k_max=k_max[:48]+"..."+k_max[-51:] + strout="%d\t(%s)%d\t%s\t%d\t%d"%\ + (self.count,k_max,(abs(start-end)+1)/period,\ + seqname,astart,aend) + self._fileout.write("%s\n"%(strout)) + + elif name=="start": + self.inStartContent=0 + elif name=="end": + self.inEndContent=0 + elif name=="period": + self.inPeriodContent=0 + elif name=="score": + self.inScoreContent=0 + elif name=="unit": + self.inUnitContent=0 + if self.type.has_key(self.unit): + self.type[self.unit]+=1 + else: + self.type[self.unit]=1 + + def endDocument(self): + self._fileout.close() \ No newline at end of file diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/GbParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/GbParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,111 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +import sys +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.parsing.TranscriptListParser import TranscriptListParser + + +class GbParser(TranscriptListParser): + """A class that parses a GBrowse file and create a transcript list""" + + + def __init__(self, fileName, verbosity = 0): + self.reference = None + self.color = None + super(GbParser, self).__init__(fileName, verbosity) + + + def __del__(self): + super(GbParser, self).__del__() + + + def getFileFormats(): + return ["gb", "gbrowse"] + getFileFormats = staticmethod(getFileFormats) + + + def skipFirstLines(self): + for line in self.handle: + self.currentLineNb += 1 + line = line.strip() + m = re.search(r"^\s*bgcolor\s*=\s*(\S+)\s*$", line) + if m != None: + self.color = m.group(1) + if line == "": + return + + + def parseLine(self, line): + transcript = Transcript() + # first line (reference) + m = re.search(r"^\s*reference\s*=\s*(\S+)\s*$", line) + if m != None: + self.reference = m.group(1) + for line in self.handle: + line = line.strip() + self.currentLineNb += 1 + break + # 
second line (genomic coordinates) + m = re.search(r"^\s*READS\s+(\S+)\s+(\S+)\s+\"([^\"]*)\"\s*$", line) + if m == None: + sys.exit("\nLine %d '%s' does not have a GBrowse format" % (self.currentLineNb, line)) + if self.reference == None: + sys.exit("Cannot get reference of GBrowse line %d '%s'" % (self.currentLineNb, line)) + transcript.setChromosome(self.reference) + transcript.setName(m.group(1)) + transcript.setComment(m.group(3)) + # exons + exons = m.group(2).split(",") + transcriptStart = 1000000000 + transcriptEnd = 0 + direction = 0 + for exon in exons: + m = re.search(r"^(\d+)-(\d+)$", exon) + if m == None: + sys.exit("\nCannot read GBrowse exon line %d '%s'" % (self.currentLineNb, exon)) + interval = Interval() + interval.setChromosome(transcript.chromosome) + direction += int(m.group(2)) - int(m.group(1)) + start = min(int(m.group(1)), int(m.group(2))) + end = max(int(m.group(1)), int(m.group(2))) + interval.setStart(start) + interval.setEnd(end) + transcriptStart = min(transcriptStart, start) + transcriptEnd = max(transcriptEnd, end) + transcript.addExon(interval) + transcript.setStart(transcriptStart) + transcript.setEnd(transcriptEnd) + transcript.setDirection(direction) + for exon in transcript.getExons(): + exon.setDirection(direction) + return transcript + diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/GffParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/GffParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,149 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +import sys +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.parsing.TranscriptListParser import TranscriptListParser + + +class GffParser(TranscriptListParser): + """A class that parses a GFF file and create a transcript list""" + + + def __init__(self, fileName, verbosity = 0): + super(GffParser, self).__init__(fileName, verbosity) + + + def __del__(self): + super(GffParser, self).__del__() + + + def getFileFormats(): + return ["gff", "gff2", "gff3"] + getFileFormats = staticmethod(getFileFormats) + + + def skipFirstLines(self): + pass + + + def getInfos(self): + self.chromosomes = set() + self.nbTranscripts = 0 + self.size = 0 + self.reset() + if self.verbosity >= 10: + print "Getting information on %s." 
% (self.fileName) + self.reset() + for line in self.handle: + line = line.strip() + if line == "" or line[0] == "#": + continue + parts = line.split("\t") + if len(parts) != 9: + raise Exception("Error! Line '%s' has %d tab-separated fields instead of 9!" % (line, len(parts))) + self.chromosomes.add(parts[0]) + if parts[8].find("Parent") == -1: + self.nbTranscripts += 1 + else: + self.size += max(int(parts[3]), int(parts[4])) - min(int(parts[3]), int(parts[4])) + 1 + if self.verbosity >= 10 and self.nbTranscripts % 100000 == 0: + sys.stdout.write(" %d transcripts read\r" % (self.nbTranscripts)) + sys.stdout.flush() + self.reset() + if self.verbosity >= 10: + print " %d transcripts read" % (self.nbTranscripts) + print "Done." + + + def parseLine(self, line): + if not line or line[0] == "#": + return None + m = re.search(r"^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-.])\s+(\S+)\s+(\S.*)$", line) + if m == None: + raise Exception("\nLine %d '%s' does not have a GFF format\n" % (self.currentLineNb, line)) + interval = Interval() + interval.setChromosome(m.group(1)) + interval.setName("unnamed transcript") + interval.setStart(min(int(m.group(4)), int(m.group(5)))) + interval.setEnd(max(int(m.group(4)), int(m.group(5)))) + if m.group(7) == ".": + interval.setDirection("+") + else: + interval.setDirection(m.group(7)) + interval.setTagValue("feature", m.group(3)) + if m.group(6).isdigit(): + interval.setTagValue("score", m.group(6)) + + remainings = m.group(9).split(";") + for remaining in remainings: + remaining = remaining.strip() + if remaining == "": + continue + posSpace = remaining.find(" ") + posEqual = remaining.find("=") + if posEqual != -1 and (posEqual < posSpace or posSpace == -1): + parts = remaining.split("=") + else: + parts = remaining.split() + field = parts[0].strip() + value = " ".join(parts[1:]).strip(" \"") + if field in ("Name", "name", "Sequence", "TE", "SAT"): + interval.setName(value) + else: + try: + intValue = int(value) + 
interval.setTagValue(field, intValue) + except ValueError: + interval.setTagValue(field, value) + + self.currentTranscriptAddress = self.previousTranscriptAddress + if "Parent" in interval.getTagNames(): + if self.currentTranscript == None: + raise Exception("GFF file does not start with a transcript! First line is '%s'." % (line.strip())) + if interval.getTagValue("Parent") != self.currentTranscript.getTagValue("ID"): + raise Exception("Exon '%s' is not right after its transcript in GFF file!" % (interval)) + self.currentTranscript.addExon(interval) + if interval.name == None: + interval.name = self.currentTranscript.name + return None + + transcript = self.currentTranscript + self.currentTranscript = Transcript() + self.currentTranscript.copy(interval) + self.previousTranscriptAddress = self.currentAddress + + if transcript != None and transcript.name.startswith("unnamed"): + if "ID" in transcript.getTagNames(): + transcript.name = transcript.getTagValue("ID") + else: + transcript.name = "unnamed transcript %s" % (self.currentLineNb) + return transcript diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/GtfParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/GtfParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,113 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +import sys +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.parsing.TranscriptListParser import TranscriptListParser + + +class GtfParser(TranscriptListParser): + """A class that parses a GTF file and create a transcript list""" + + + def __init__(self, fileName, verbosity = 0): + super(GtfParser, self).__init__(fileName, verbosity) + + + def __del__(self): + super(GtfParser, self).__del__() + + + def getFileFormats(): + return ["gtf", "gtf2"] + getFileFormats = staticmethod(getFileFormats) + + + def skipFirstLines(self): + pass + + + def parseLine(self, line): + if line[0] == "#": + return None + m = re.search(r"^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+-.])\s+(\S+)\s+(\S.*)$", line) + if m == None: + raise Exception("\nLine %d '%s' does not have a GTF format\n" % (self.currentLineNb, line)) + interval = Interval() + interval.setChromosome(m.group(1)) + interval.setName("unnamed transcript") + interval.setStart(min(int(m.group(4)), int(m.group(5)))) + interval.setEnd(max(int(m.group(4)), int(m.group(5)))) + if m.group(7) == 
".": + interval.setDirection("+") + else: + interval.setDirection(m.group(7)) + if m.group(6).isdigit(): + interval.setTagValue("score", m.group(6)) + type = m.group(3) + + if type not in ("transcript", "exon"): + return None + + remainings = m.group(9).split(";") + for remaining in remainings: + remaining = remaining.strip() + if remaining == "": + continue + parts = remaining.split(" ", 1) + field = parts[0].strip() + value = " ".join(parts[1:]).strip(" \"") + if field == "transcript_id": + interval.setTagValue("ID", value) + elif field == "gene_name": + interval.setName(value) + elif field == "transcript_name": + interval.setName(value) + elif field == "exon_number": + continue + else: + try: + intValue = int(value) + interval.setTagValue(field, intValue) + except ValueError: + interval.setTagValue(field, value) + + self.currentTranscriptAddress = self.previousTranscriptAddress + if self.currentTranscript == None or interval.getTagValue("ID") != self.currentTranscript.getTagValue("ID"): + transcript = self.currentTranscript + self.currentTranscript = Transcript() + self.currentTranscript.copy(interval) + self.currentTranscript.setTagValue("feature", "transcript") + self.previousTranscriptAddress = self.currentAddress + return transcript + if type == "exon": + self.currentTranscript.addExon(interval) + return None diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/MapParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MapParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,67 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". 
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import re +import sys +from SMART.Java.Python.structure.Mapping import Mapping +from commons.core.parsing.MapperParser import MapperParser +from SMART.Java.Python.structure.SubMapping import SubMapping +from SMART.Java.Python.misc import Utils +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.parsing.TranscriptListParser import TranscriptListParser + + +class MapParser(TranscriptListParser): + """A class that parses the repet .map files""" + + def __init__(self, fileName, verbosity = 0): + self._lineParseRe = re.compile(r"(?P<seqName>\w+)\s(?P<chrName>\w+)\s(?P<sStart>\d+)\s(?P<sEnd>\d+)") + TranscriptListParser.__init__(self, fileName, verbosity) + + def getFileFormats(): + return ["map"] + getFileFormats = staticmethod(getFileFormats) + + def skipFirstLines(self): + return + + def parseLine(self, line): + m = self._lineParseRe.search(line) + + if m == None: + sys.exit("\nLine %d '%s' does not have a map format" % (self.currentLineNb, line)) + + transcript = Transcript() + transcript.setChromosome(m.group("chrName")) + transcript.setStart(min(int(m.group("sStart")), int(m.group("sEnd")))) + transcript.setEnd(max(int(m.group("sStart")), int(m.group("sEnd")))) + transcript.setName(m.group("seqName")) + transcript.setDirection(1) + + return transcript diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/MapperParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MapperParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,129 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import sys +from SMART.Java.Python.structure.Mapping import Mapping + + +class MapperParser(object): + """An interface that parses the output of a generic mapper""" + + def __init__(self, fileName, verbosity = 0): + super(MapperParser, self).__init__() + self.verbosity = verbosity + self.nbMappings = None + self.chromosomes = None + self.size = None + self.currentMapping = Mapping() + self.handle = open(fileName) + self.currentLineNb = 0 + self.skipFirstLines() + self.fileName = fileName + self.startingPoint = self.handle.tell() + + + def __del__(self): + self.handle.close() + + + def reset(self): + self.handle.seek(self.startingPoint) + self.currentLineNb = 0 + + + def getNextMapping(self): + for line in self.handle: + mapping = self.parseLine(line) + self.currentLineNb += 1 + if mapping != None: + return mapping + return False + + + def getIterator(self): + self.reset() + mapping = self.getNextMapping() + while mapping: + yield mapping + mapping = self.getNextMapping() + + + def getInfos(self): + self.chromosomes = set() + self.nbMappings = 0 + self.size = 0 + self.reset() + if self.verbosity >= 10: + print "Getting information." + for mapping in self.getIterator(): + transcript = mapping.getTranscript() + self.chromosomes.add(transcript.getChromosome()) + self.nbMappings += 1 + self.size += transcript.getSize() + if self.verbosity >= 10 and self.nbMappings % 100000 == 0: + sys.stdout.write(" %d mappings read\r" % (self.nbMappings)) + sys.stdout.flush() + self.reset() + if self.verbosity >= 10: + print " %d mappings read" % (self.nbMappings) + print "Done." 
+ + + def getNbMappings(self): + if self.nbMappings != None: + return self.nbMappings + self.getInfos() + return self.nbMappings + + + def getNbItems(self): + return self.getNbMappings() + + + def getChromosomes(self): + if self.chromosomes != None: + return self.chromosomes + self.getInfos() + return self.chromosomes + + + def getSize(self): + if self.size != None: + return self.size + self.getInfos() + return self.size + + + def getNbNucleotides(self): + return self.getSize() + + + def setDefaultTagValue(self, name, value): + for mapping in self.getIterator(): + mapping.setTagValue(name, value) diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/MaqParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MaqParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,77 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. 
Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +import sys +from SMART.Java.Python.structure.Mapping import Mapping +from commons.core.parsing.MapperParser import MapperParser + + +class MaqParser(MapperParser): + """A class that parses the output of Maq""" + + def __init__(self, fileName, verbosity = 0): + super(MaqParser, self).__init__(fileName, verbosity) + + + def __del__(self): + super(MaqParser, self).__del__() + + + def getFileFormats(): + return ["maq"] + getFileFormats = staticmethod(getFileFormats) + + + def skipFirstLines(self): + pass + + + def parseLine(self, line): + m = re.search(r"^\s*(\S+)\s+(\S+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s*$", line) + if m == None: + sys.exit("\nLine %d '%s' does not have a MAQ format" % (self.currentLineNb, line)) + + mapping = Mapping() + + mapping.targetInterval.setStart(int(m.group(3))) + mapping.targetInterval.setSize(int(m.group(14))) + mapping.targetInterval.setChromosome(m.group(2)) + + mapping.queryInterval.setStart(1) + mapping.queryInterval.setSize(int(m.group(14))) + mapping.queryInterval.setName(m.group(1)) + + mapping.setDirection(m.group(4)) + mapping.setSize(int(m.group(14))) + mapping.setNbMismatches(int(m.group(10))) + mapping.setRank(1) + mapping.setNbOccurrences(int(m.group(12)) + int(m.group(13))) + + return mapping diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/MrepsToSet.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MrepsToSet.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,31 @@ +from 
commons.core.parsing.FindRep import FindRep +from xml.sax import make_parser +from xml.sax.handler import feature_namespaces +import os + + +class MrepsToSet(object): + + def __init__(self, mrepsInputFileName="", mrepsOuputFileName="", outputFileName=None, errorFilter=0): + self._mrepsInputFileName = mrepsInputFileName + self._mrepsOuputFileName = mrepsOuputFileName + self._outputFileName = outputFileName or "%s.Mreps.set" % mrepsOuputFileName + self._errorFilter = errorFilter + + def run(self): + xmlParser = make_parser() + xmlParser.setFeature( feature_namespaces, 0 ) + xmlParser.setContentHandler( FindRep( self._outputFileName, self._errorFilter, 0 ) ) + xmlParser.parse( self._mrepsOuputFileName ) + + def clean( self ): + """ + Remove the output file (xml) from Mreps to keep only the 'set' file. + """ + if os.path.exists(self._mrepsOuputFileName): + os.remove(self._mrepsOuputFileName) + + + + + \ No newline at end of file diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/Multifasta2SNPFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/Multifasta2SNPFile.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,846 @@ +import re +import os +import logging +from commons.core.utils.FileUtils import FileUtils +from commons.core.seq.BioseqDB import BioseqDB +from commons.core.seq.Bioseq import Bioseq +from commons.core.LoggerFactory import LoggerFactory + +DNA_ALPHABET_WITH_N_AND_DELS = set (['A','T','G','C','N','-']) +IUPAC = set(['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N', '-', 'a','t','g','c','u','r','y','m','k','w','s','b','d','h','v','n']) + +class Multifasta2SNPFile( object ): + + POLYM_TYPE_4_SNP = "SNP" + POLYM_TYPE_4_INSERTION = "INSERTION" + POLYM_TYPE_4_DELETION = "DELETION" + POLYM_DEFAULT_CONFIDENCE_VALUE = "A" + SNP_LENGTH = 1 + FLANK_LENGTH = 250 + + def __init__(self, taxon, batchName="", geneName=""): + + if(batchName): + self._batchName = batchName + + if(geneName): + self._geneName = geneName + + 
self._taxon = taxon + self._outSubSNPFileName = "SubSNP.csv" + self._outAlleleFileName = "Allele.csv" + self._outIndividualFileName = "Individual.csv" + self._outSequenceFSAFileName = "Sequences.fsa" + self._outSequenceCSVFileName = "Sequences.csv" + self._outBatchFileName = "Batch.txt" + self._outBatchLineFileName = "BatchLine.csv" + self._logFileName = "multifasta2SNP.log" + + self._lBatchFileResults = [] + self._lSubSNPFileResults = [] + self._lRefSequences = [] + self._lIndividualFileResults = [] + self._lBatchLineFileResults = [] + self._dIndividualNumbers4SubSNPResults = {} + self._dAlleleFileResults = {} + + + self.dcurrentIndel = {} + self.lIndelsOfTheCurrentLine = [] + self.lIndelsOverAllLines = [] + self.dSNPsPositions = {} + + self._iCurrentLineNumber = 0 + self._currentBatchNumber = 1 + self.currentLineName = "" + self.currentNucleotide = "" + self.currentPosition = 0 + self._sPolymConfidenceValue = Multifasta2SNPFile.POLYM_DEFAULT_CONFIDENCE_VALUE + self._sPolymType = Multifasta2SNPFile.POLYM_TYPE_4_SNP + self._iPolymLength = Multifasta2SNPFile.SNP_LENGTH + self._fileUtils = FileUtils() + + if self._fileUtils.isRessourceExists(self._logFileName): + os.remove(self._logFileName) + self._logFile = LoggerFactory.createLogger(self._logFileName, logging.INFO, "%(asctime)s %(levelname)s: %(message)s") + + def runOneBatch( self, inFileName): + self._currentFileName = inFileName + #TODO: methode a virer; n'utiliser au final que runOneBatchWithoutWriting + self._wrapper = self.createWrapperFromFile(inFileName) + self._lBatchFileResults = self.completeBatchList() + self.detectSNPsAndIndels(self._wrapper) + self._writeAllOutputFiles() + self._currentBatchNumber += 1 + + def runOneBatchWithoutWriting( self, inFileName): + self.lIndelsOverAllLines = [] + self._currentFileName = inFileName + self._wrapper = self.createWrapperFromFile(inFileName) + self._lBatchFileResults = self.completeBatchList() + self.detectSNPsAndIndels(self._wrapper) + self._currentBatchNumber 
+= 1 + + + def _cleanOutputsInTheCurrentDir(self): + #TODO: create a list of files to be deleted + FileUtils.removeFilesByPattern("*.csv") + if (FileUtils.isRessourceExists(self._outBatchFileName)): + os.remove(self._outBatchFileName) + if (FileUtils.isRessourceExists(self._outSequenceFSAFileName)): + os.remove(self._outSequenceFSAFileName) + + + def _createOutputObjectsIteratingOnCurrentDir(self): + #TODO: gerer les extensions multiples + extList = [".fasta", ".fsa"] + for dirname, dirnames, filenames in os.walk("."): + filenames.sort() + for filename in filenames: + if os.path.splitext(filename)[1] in extList: + self._geneName = os.path.splitext(filename)[0] + self._batchName = "Batch_" + self._geneName + self.runOneBatchWithoutWriting(filename) + + def runSeveralBatches( self, inputDir): + #TODO: enlever les chdirs, appeler les fichiers en absolu et modifier les tests en consequences + os.chdir(inputDir) + self._cleanOutputsInTheCurrentDir() + self._createOutputObjectsIteratingOnCurrentDir() + self._writeAllOutputFiles() + os.chdir("../") + + + def _treatADeletionClosingWithAnotherBaseThanRefSeq(self, lineName, nucleotide, position): + if (self.isTheIndelOpen4ThisLine): + self._closeTheCurrentIndel(lineName, nucleotide, position) + self._manageSNPs(lineName, nucleotide, position) + self.addOnePolymorphicPosition(position) + + def _treatNucleotideDifferentThanRefSeqCase(self, refSeq, lineName, index, nucleotide, position): + if (nucleotide == "-" or refSeq[index] == "-"): + if (self.isTheIndelOpen4ThisLine): + self._expandTheCurrentIndel(position, nucleotide) + else: + self._startAnIndel(position, nucleotide) + else: + self._treatADeletionClosingWithAnotherBaseThanRefSeq(lineName, nucleotide, position) + + + def _treatSameNucleotideInOneIndel(self, refSeq, lineName, index, nucleotide, position): + if (self._sPolymType == Multifasta2SNPFile.POLYM_TYPE_4_DELETION): + self._closeTheCurrentIndel(lineName, nucleotide, position) + elif (self._sPolymType == 
Multifasta2SNPFile.POLYM_TYPE_4_INSERTION): + if (refSeq[index] == "-"): + self._expandTheCurrentIndel(position, nucleotide) + else: + self._closeTheCurrentIndel(lineName, nucleotide, position) + + def detectSNPsAndIndels(self, iRefAndLines): + refSeq = iRefAndLines.getReferenceSequence() + refSeqLength = len ( refSeq ) + self.dSNPsPositions = {} + + for iLineBioseq in iRefAndLines.getLinesBioseqInstances(): + lineSequence = iLineBioseq.sequence + self.currentLineName = iLineBioseq.header + self._manageCurrentIndividual(self.currentLineName) + + index = 0 + self.isTheIndelOpen4ThisLine = 0 + self.lIndelsOfTheCurrentLine = [] + for nucleotide in lineSequence: + position = index + 1 + if (index < refSeqLength) and self._isSNPDetected(refSeq, index, nucleotide): + self._treatNucleotideDifferentThanRefSeqCase(refSeq, self.currentLineName, index, nucleotide, position) + elif(index < refSeqLength and self.isTheIndelOpen4ThisLine) : + self._treatSameNucleotideInOneIndel(refSeq, self.currentLineName, index, nucleotide, position) + index = index + 1 + self.currentNucleotide = nucleotide + self.currentPosition = position + + self.lIndelsOverAllLines = self.lIndelsOverAllLines + self.lIndelsOfTheCurrentLine + + self._postTraitementDetectSNP(self.currentLineName, self.currentNucleotide, self.currentPosition) + + def _manageCurrentIndividual(self, lineName): + self._lIndividualFileResults = self._completeIndividualListWithCurrentIndividual(self._lIndividualFileResults, lineName) + self._lBatchLineFileResults = self._completeBatchLineListWithCurrentIndividual(self._lBatchLineFileResults, self._lIndividualFileResults, lineName) + if not self._dIndividualNumbers4SubSNPResults.__contains__(lineName): + self._dIndividualNumbers4SubSNPResults[lineName] = len(self._lIndividualFileResults) + + + def _manageLastPositionIndels(self, lineName, nucleotide, position): + if (self.isTheIndelOpen4ThisLine): + self._closeTheCurrentIndel(lineName, nucleotide, position) + 
self.lIndelsOverAllLines.append(self.lIndelsOfTheCurrentLine.pop()) + + def _postTraitementDetectSNP(self, lineName, nucleotide, position): + self._manageLastPositionIndels(lineName, nucleotide, position) + + self._mergeAllelesAndSubSNPsFromOverlappingIndels() + self._addMissingsAllelesAndSubSNPs() + + self._lSubSNPFileResults = self._sortSubSNPResultByBatchPositionAndLineName(self._lSubSNPFileResults) + + def _manageSNPs(self, lineName, nucleotide, position): + self._dAlleleFileResults = self._completeAlleleSetWithCurrentAllele(self._dAlleleFileResults, nucleotide) + truePosition = self.getUngappedPositionInRefSeq(position) + subSNPName = self._formatSubSNPName(lineName, truePosition, Multifasta2SNPFile.POLYM_TYPE_4_SNP) + iAlleleNumber = self._dAlleleFileResults[nucleotide] + self._sPolymType = Multifasta2SNPFile.POLYM_TYPE_4_SNP + flank5Prime, flank3Prime = self.getFlanksOfASubSNP(lineName, position, Multifasta2SNPFile.SNP_LENGTH, Multifasta2SNPFile.FLANK_LENGTH) + dSubSNPResult = {'subSNPName':subSNPName, 'position':truePosition, 'lineName':self._dIndividualNumbers4SubSNPResults[lineName], + 'allele':iAlleleNumber, 'batchNumber': self._currentBatchNumber, 'confidenceValue':self._sPolymConfidenceValue, + 'type':self._sPolymType, 'length': Multifasta2SNPFile.SNP_LENGTH, + '5flank':flank5Prime, '3flank':flank3Prime} + if(not self.subSNPExistsInSubSNPList(dSubSNPResult, self._lSubSNPFileResults)): + self._lSubSNPFileResults.append(dSubSNPResult) + + def _startAnIndel(self, position, nucleotide): + self.dcurrentIndel['start'] = position + self.dcurrentIndel['end'] = position + self.sCurrentIndelAllele = nucleotide + if(nucleotide == "-"): + self._sPolymType = Multifasta2SNPFile.POLYM_TYPE_4_DELETION + else: + self._sPolymType = Multifasta2SNPFile.POLYM_TYPE_4_INSERTION + self.isTheIndelOpen4ThisLine = 1 + + def _expandTheCurrentIndel(self, position, nucleotide): + self.sCurrentIndelAllele = self.sCurrentIndelAllele + nucleotide + self.dcurrentIndel['end'] = position 
+ + def _closeTheCurrentIndel(self, lineName, nucleotide, position): + subSNPName = self._formatSubSNPName(lineName, self.dcurrentIndel['start'], self._sPolymType) + + dIndel4TheLine = {'name': subSNPName, 'lineName': lineName, 'start': self.dcurrentIndel['start'],'end' :self.dcurrentIndel['end'], + 'allele': self.sCurrentIndelAllele, 'type': self._sPolymType, 'length': self._iPolymLength} + + dIndel4TheLine['length'] = self.getAnIndelLength(dIndel4TheLine) + + self.lIndelsOfTheCurrentLine.append(dIndel4TheLine) + + self.dcurrentIndel.clear() + self.isTheIndelOpen4ThisLine = 0 + + + def _mergeAllelesAndSubSNPsFromOverlappingIndels(self): + lIndelList = [] + for dIndel in self.lIndelsOverAllLines: + lIndelList = self.clusteriseIndels(dIndel, self.lIndelsOverAllLines) + + for dIndel in lIndelList: + oldAllele = dIndel['allele'] + start = dIndel['start'] + stop = dIndel['end'] + lineName = dIndel['lineName'] + + LineBioSeq = self._wrapper._iLinesBioseqDB.fetch(lineName) + dIndel = self.updateAllele(oldAllele, start, stop, LineBioSeq, dIndel) + dSubSNPResult = self.createSubSNPFromAMissingPolym(dIndel, lineName) + if(not self.subSNPExistsInSubSNPList(dSubSNPResult, self._lSubSNPFileResults)): + self._lSubSNPFileResults.append(dSubSNPResult) + + def updateAllele(self, oldAllele, start, stop, LineBioSeq, dIndel): + #TODO: creer le test + newAllele = LineBioSeq.subseq(start, stop).sequence + if newAllele != oldAllele: + dIndel['allele'] = newAllele + self._dAlleleFileResults = self._completeAlleleSetWithCurrentAllele(self._dAlleleFileResults, newAllele) + return dIndel + + + def getFlanksOfASubSNP(self, lineName, subsnpPosition, polymLength, flankLength): + bioSeqOfTheLine = self._wrapper._iLinesBioseqDB.fetch(lineName) + flank5Prime = bioSeqOfTheLine.get5PrimeFlank(subsnpPosition, flankLength) + flank3Prime = bioSeqOfTheLine.get3PrimeFlank(subsnpPosition, flankLength, polymLength) + + return flank5Prime, flank3Prime + + def createSubSNPFromAMissingPolym(self, dIndel, 
lineName): + if(dIndel['type'] == Multifasta2SNPFile.POLYM_TYPE_4_INSERTION): + start = self.getUngappedPositionInRefSeq(dIndel['start']-1) + else: + start = self.getUngappedPositionInRefSeq(dIndel['start']) + + subSNPName = self._formatSubSNPName(dIndel['lineName'], start, dIndel['type']) + + iAlleleNumber = self._dAlleleFileResults[dIndel['allele']] + + iPolymLength = self.getAnIndelLength(dIndel) + + flank5Prime, flank3Prime = self.getFlanksOfASubSNP(lineName, dIndel['start'], iPolymLength, Multifasta2SNPFile.FLANK_LENGTH) + + dSubSNPResult = {'subSNPName':subSNPName, 'position':start, 'lineName':self._dIndividualNumbers4SubSNPResults[lineName], 'allele':iAlleleNumber, + 'batchNumber': self._currentBatchNumber, 'confidenceValue':self._sPolymConfidenceValue, 'type':dIndel['type'], + 'length': iPolymLength, '5flank':flank5Prime, '3flank':flank3Prime} + + return dSubSNPResult + + def clusteriseIndels(self, dIndel, lIndelsOverAllLines): + iIndice = 0 + for dIndel in lIndelsOverAllLines: + iIndice2Compare = 0 + for dIndel2Compare in lIndelsOverAllLines: + dIndel, dIndel2Compare = self.mergeBoundsForTwoOverlappingIndels(dIndel, dIndel2Compare) + lIndelsOverAllLines = self.updateBoundsForAnIndelInAnIndelList(lIndelsOverAllLines, dIndel) + lIndelsOverAllLines = self.updateBoundsForAnIndelInAnIndelList(lIndelsOverAllLines, dIndel2Compare) + iIndice2Compare = iIndice2Compare + 1 + iIndice = iIndice + 1 + + return lIndelsOverAllLines + + def mergeBoundsForTwoOverlappingIndels(self, dIndel1, dIndel2): + if((dIndel2['start'] <= dIndel1['start']) and (dIndel2['end'] >= dIndel1['start']) or + (dIndel1['start'] <= dIndel2['start']) and (dIndel1['end'] >= dIndel2['start'])): + if(dIndel1['start'] <= dIndel2['start']): + iStart = dIndel1['start'] + else: + iStart = dIndel2['start'] + + if(dIndel1['end'] >= dIndel2['end']): + iEnd = dIndel1['end'] + else: + iEnd = dIndel2['end'] + + dIndel1['start'] = iStart + dIndel1['end'] = iEnd + dIndel2['start'] = iStart + dIndel2['end'] = 
iEnd + + return dIndel1, dIndel2 + + def updateBoundsForAnIndelInAnIndelList(self, lIndelsList, dIndelWithNewBounds): + name = dIndelWithNewBounds['name'] + dIndelInTheList, iIndice = self.findAnIndelInAListWithHisName(name, lIndelsList) + lIndelsList.remove(dIndelInTheList) + lIndelsList.insert(iIndice, dIndelWithNewBounds) + return lIndelsList + + + def findASubSNPInAListWithHisName(self, name, lSubSNPList): + dSubSNP2Find = {} + indice = 0 + indice2Find = -1 + for dSubSNP in lSubSNPList: + if(dSubSNP['subSNPName'] == name): + dSubSNP2Find = dSubSNP + indice2Find = indice + indice = indice + 1 + + if dSubSNP2Find == {} or indice2Find == -1: + msg = "trying to find a SubSNP not existing: " + name + self._logFile.error(msg) + raise Exception ("trying to find a SubSNP not existing: " + name) + else: + return dSubSNP2Find, indice2Find + + def subSNPExistsInSubSNPList(self, dSubSNP2Find, lSubSNPList): + flag = 0 + for dSubSNP in lSubSNPList: + if(dSubSNP2Find['subSNPName'] == dSubSNP['subSNPName']): + flag = 1 + + if flag == 1: + return True + else: + return False + + + def findAnIndelInAListWithHisName(self, name, lIndelList): + dIndel2Find = {} + indice = 0 + indice2Find = -1 + for dIndel in lIndelList: + if(dIndel['name'] == name): + dIndel2Find = dIndel + indice2Find = indice + indice = indice + 1 + + if dIndel2Find == {} or indice2Find == -1: + msg = "trying to find an indel not existing: " + name + self._logFile.error(msg) + raise Exception (msg) + else: + return dIndel2Find, indice2Find + + def _addMissingsAllelesAndSubSNPs(self): + for dIndel in self.lIndelsOverAllLines: + start = dIndel['start'] + end = dIndel['end'] + type = dIndel['type'] + self.addMissingAllelesAndSubSNPsForOnePolym(start, end, type) + + for position in self.dSNPsPositions: + self.addMissingAllelesAndSubSNPsForOnePolym(position, position, "SNP") + + def addMissingAllelesAndSubSNPsForOnePolym(self, start, end, polymType): + refSeqAllele = self._wrapper._iReferenceBioseq.subseq(start, 
end).sequence + BioSeqDb = self._wrapper.getLinesBioseqInstances() + lBioSeqDbAlleles = self.getAllelesOfASubSeq(BioSeqDb, start, end) + for subSequence in lBioSeqDbAlleles: + if(subSequence['allele'] == refSeqAllele): + lineName = subSequence['header'] + dMissingPolym = {'lineName': lineName, 'start': start,'end' :end, + 'allele': subSequence['allele'], 'type':polymType} + self._dAlleleFileResults = self._completeAlleleSetWithCurrentAllele(self._dAlleleFileResults, subSequence['allele']) + dSubSNPResult = self.createSubSNPFromAMissingPolym(dMissingPolym, lineName) + if(not self.subSNPExistsInSubSNPList(dSubSNPResult, self._lSubSNPFileResults)): + self._lSubSNPFileResults.append(dSubSNPResult) + + def addOnePolymorphicPosition(self, position): + if(not self.dSNPsPositions.has_key(position)): + self.dSNPsPositions[position] = 1 + + def getUngappedPositionInRefSeq(self, position): + if(position ==1): + nbOfGaps = 0 + else: + seqIn5Prime = self._wrapper._iReferenceBioseq.subseq(1, position-1).sequence + nbOfGaps = seqIn5Prime.count("-") + + return position - nbOfGaps + + def getAllelesOfASubSeq(self, BioSeqDb, start, end): + lAlleles = [] + for iBioSeq in BioSeqDb: + dAlleles = {} + dAlleles['header'] = iBioSeq.header + dAlleles['allele'] = iBioSeq.subseq(start, end).sequence + lAlleles.append(dAlleles) + + return lAlleles + + def getAnIndelLength(self, dIndel): + length = 0 + if(dIndel['type'] == Multifasta2SNPFile.POLYM_TYPE_4_DELETION): + length = dIndel['end'] - dIndel['start'] + 1 + else: + length = len(dIndel['allele']) + + return length + + def createWrapperFromFile(self, inFileName): + faF = open(inFileName, "r") + iBioSeqDB = self._extractSequencesFromInputFile(faF) + faF.close() + + iBioSeqDB.upCase() + referenceBioseq = iBioSeqDB[0] + linesBioSeqDB = iBioSeqDB.extractPart(1, iBioSeqDB.getSize() - 1) + + try: + if(FileUtils.isEmpty(inFileName)): + msg = "The input file is empty!" 
+ self._logFile.error(self._prefixeWithLineNumber (msg)) + raise Exception (self._prefixeWithFileName (msg)) + if(self.isHeaderInRefSeqList(referenceBioseq.header)): + msg = "This reference sequence already exists in one previous file!" + self._logFile.error(self._prefixeWithLineNumber (msg)) + raise Exception (self._prefixeWithLineNumber (msg)) + except Exception, e : + raise Exception ("Problem with one input file: \n" + str(e)) + + self._lRefSequences.append(referenceBioseq) + + return ReferenceBioseqAndLinesBioseqDBWrapper(referenceBioseq, linesBioSeqDB, self._logFile, inFileName) + + def isHeaderInRefSeqList(self, header): + isHeader = False + for item in self._lRefSequences: + if item.header == header: + isHeader = True + return isHeader + + def completeBatchList(self): + dBatchResults = {'BatchNumber' : self._currentBatchNumber, 'BatchName' : self._batchName, 'GeneName' : self._geneName,'ContactNumber' : "1", + 'ProtocolNumber' : "1", 'ThematicNumber' : "1", 'RefSeqName': self._wrapper._iReferenceBioseq.header} + + self._lBatchFileResults.append(dBatchResults) + + return self._lBatchFileResults + + def getLineAsAHeader(self, lineToBeCheck, lineNumber = 0): + """ + header line begin with the tag(or token) '>' tag + ended with an carriage return + contain The name of sequence must respect this alphabet [a-zA-Z0-9_-:] + """ + obsHeader = lineToBeCheck + if obsHeader[0]!=">" : + msg = "tag '>' omitted before header" + self._logFile.error(self._prefixeWithLineNumber (msg)) + raise Exception (self._prefixeWithLineNumber (msg)) + else : + obsHeader = obsHeader[1:] + obsHeader = obsHeader.replace ("\n","") + obsHeader = self._removeRepeatedBlanksInAStr(obsHeader) + obsHeader = self._replaceBlankByUnderScoreInAStr(obsHeader) + if self.checkHeaderAlphabet(obsHeader) : + return obsHeader + self._logFile.error(self._prefixeWithLineNumber ("fatal error on header")) + raise Exception (self._prefixeWithLineNumber ("fatal error on header")) + + def getLineAsASeq(self, 
lineToBeCheck): + """ + Sequence line + ended with an carriage return + contain only character of the IUPAC alphabet + """ + obsSeq = str.upper(lineToBeCheck) + obsSeq = obsSeq.replace ("\n","") + obsSeq = obsSeq.replace ("\r","") + obsLine = obsSeq.replace("-","") + if not self.isIUPAC_bases(obsLine) : + msg = "the sequence contain a non nucleic character " + self._logFile.error(self._prefixeWithLineNumber (msg)) + raise Exception (self._prefixeWithLineNumber (msg)) + return obsSeq + + def checkHeaderAlphabet( self, strToCheck): + """ + Check the string + the string is not a header when founding a pattern not corresponding to the regexp + \W Matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_-:]. + """ + if strToCheck=="": + return False + p = re.compile('[^a-zA-Z0-9_:\-]', re.IGNORECASE) #p = re.compile('(\W|-|:)+', re.IGNORECASE) + errList=p.findall(strToCheck) + if len( errList ) > 0 : + return False + else: + return True + + ## Check the string is nucleotides sequence from the DNA_ALPHABET_WITH_N = ["A","T","G","C","N"] of IUPAC nomenclature. + # @return True if sequence contain A, T, G, C or N False otherwise + # + def isDNA_bases( self, sequence): + if sequence == "" : + return False + + setFromString = set() + + for nt in sequence : + setFromString.add(nt) + + return setFromString.issubset(DNA_ALPHABET_WITH_N_AND_DELS) + + ## Check if the string is nucleotides sequence from the IUPAC ALPHABET . 
+ # @return True if sequence contain IUPAC letters False otherwise + # + def isIUPAC_bases( self, sequence): + if sequence == "" : + return False + + setFromString = set() + + for nt in sequence : + setFromString.add(nt) + + return setFromString.issubset(IUPAC) + + def _writeAllOutputFiles(self): + writer = Multifasta2SNPFileWriter() + writer.write(self) + + def _sortSubSNPResultByBatchPositionAndLineName(self, lSubSNPResults): + return sorted(lSubSNPResults, key=lambda SNPresults: (SNPresults['batchNumber'], SNPresults['position'], SNPresults['lineName'])) + + def _formatSubSNPName(self, currentLineHeader, position, polymType): + shortPolymType = polymType[:3] + return self._batchName + "_" + shortPolymType + "_" + str(position) + "_" + currentLineHeader + + def _isSNPDetected(self, referenceSequence, index, nt): + if((nt != referenceSequence[index]) and (nt.upper() != "N") and (referenceSequence[index].upper() != "N")): + return True + else: + return False + + def _extractSequencesFromInputFile(self, inFile): + # attention : DNA_ALPHABET_WITH_N_AND_DELS = Set (['A','T','G','C','N']) no including "gap" + lInFileLines = inFile.readlines() + nbOfLines = len(lInFileLines) - 1 + #premiere lecture + self._iCurrentLineNumber = 0 + isSameSeq = False + newSeq = "" + bioseqDB = BioseqDB () + while self._iCurrentLineNumber < nbOfLines : + bioseq = Bioseq() + bioseq.header = self.getLineAsAHeader( lInFileLines[self._iCurrentLineNumber] ) + isSameSeq = True + while isSameSeq and (self._iCurrentLineNumber < nbOfLines) : + self._iCurrentLineNumber +=1 + if lInFileLines[self._iCurrentLineNumber][0] == ">" : + isSameSeq = False + else : + newSeq = newSeq + self.getLineAsASeq( lInFileLines[self._iCurrentLineNumber] ) + isSameSeq = True + bioseq.setSequence(newSeq) + newSeq = "" + bioseqDB.add(bioseq) + return bioseqDB + + def _removeRepeatedBlanksInAStr (self, StrToClean ): + resStr=StrToClean.expandtabs(2) + compResStr=resStr.replace (" "," ") + while compResStr != resStr : + 
resStr=compResStr + compResStr=resStr.replace (" "," ") + return resStr + + def _replaceBlankByUnderScoreInAStr (self, StrToClean ): + resStr = StrToClean.replace (" ","_") + return resStr + + def _prefixeWithLineNumber (self, strMsg): + resStr = "File: " + self._currentFileName + "\t" + resStr += "Line %i " % (self._iCurrentLineNumber+1 ) + strMsg + return resStr + + def _prefixeWithFileName (self, strMsg): + resStr = "File: " + self._currentFileName + "\n" + strMsg + return resStr + + + def _completeAlleleSetWithCurrentAllele(self, dAlleleFileResults, dnaBase): + if dAlleleFileResults.has_key(dnaBase): + return dAlleleFileResults + else: + iAlleleNumber = len(dAlleleFileResults) + 1 + dAlleleFileResults[dnaBase] = iAlleleNumber + return dAlleleFileResults + + def _completeIndividualListWithCurrentIndividual(self, lIndividualResults, lineName): + if lIndividualResults == []: + iIndividualNumber = 1 + else: + iIndividualNumber = len(lIndividualResults) + 1 + + #TODO: transformer la liste de dictionnaire en liste d'objets + if not (self._checkIfALineExistInList(lIndividualResults, lineName)): + dIndividual2Add = {'individualNumber': iIndividualNumber, 'individualName': lineName, 'scientificName': self._taxon} + lIndividualResults.append(dIndividual2Add) + + return lIndividualResults + + def _completeBatchLineListWithCurrentIndividual(self, lBatchLineResults, lIndividualResults, lineName): + lineDict = self._getALineDictFromADictListWithALineName(lIndividualResults, lineName) + + if len(lineDict) != 0: + if(lineDict.has_key('individualNumber')): + indivNumberOfTheLineDict = lineDict['individualNumber'] + else: + msg = "Problem with the batchLine results construction: individual named " + lineName + " has no individual number!" + self._logFile.error(msg) + raise Exception (msg) + else: + msg = "Problem with the batchLine results construction: individual named " + lineName + " not in individual list!" 
+ self._logFile.error(msg) + raise Exception (msg) + + dResults2Add = {'IndividualNumber': str(indivNumberOfTheLineDict), 'BatchNumber' : self._currentBatchNumber} + lBatchLineResults.append(dResults2Add) + return lBatchLineResults + + def _getALineDictFromADictListWithALineName(self, lDictList, lineName): + dictToReturn = {} + for myDict in lDictList: + if myDict['individualName'] == lineName: + dictToReturn = myDict + + return dictToReturn + + def _checkIfALineExistInList(self, lDictList, lineName): + for myDict in lDictList: + if myDict['individualName'] == lineName: + return True + return False + + def _getCurrentBatchResult(self): + return self._lBatchFileResults[self._currentBatchNumber-1] + + + + +class ReferenceBioseqAndLinesBioseqDBWrapper (object): + + def __init__ (self, iReferenceBioseq, iLinesBioSeqDB, logger, fileName): + self._iReferenceBioseq = iReferenceBioseq + self._iLinesBioseqDB = iLinesBioSeqDB + self._logger = logger + self._currentFileName = fileName + self._checkAllSeqs() + + + def _checkAllSeqs(self): + self._iReferenceBioseq.checkEOF() + refSeqLen = self._iReferenceBioseq.getLength() + + for seq in self._iLinesBioseqDB.db: + seq.checkEOF() + if(not seq.getLength() == refSeqLen): + msg = "File: " + self._currentFileName + ", problem with the sequence " + seq.header + ": its length is different from the reference seq! 
All the sequences must have the same length.\n" + msg += "refseq length: " + str(refSeqLen) + "\n" + msg += "seq length: " + str(seq.getLength()) + "\n" + self._logger.error(msg) + raise Exception (msg) + + def getLinesBioseqInstances(self): + return self._iLinesBioseqDB.db + + def getReferenceSequence(self): + return self._iReferenceBioseq.sequence + +class Multifasta2SNPFileWriter(object): + + SUB_SNP_FILE_HEADER = ["SubSNPName","ConfidenceValue","Type","Position","5flank", + "3flank","Length","BatchNumber","IndividualNumber","PrimerType","PrimerNumber","Forward_or_Reverse","AlleleNumber"] + + ALLELE_FILE_HEADER = ["AlleleNumber","Value","Motif","NbCopy","Comment"] + + INDIVIDUAL_FILE_HEADER = ["IndividualNumber","IndividualName","Description","AberrAneuploide", + "FractionLength","DeletionLineSynthesis","UrlEarImage","TypeLine","ChromNumber","ArmChrom","DeletionBin","ScientificName", + "local_germplasm_name","submitter_code","local_institute","donor_institute","donor_acc_id"] + + SEQUENCE_CSV_FILE_HEADER = ["SequenceName","SeqType","BankName","BankVersion","ACNumber","Locus","ScientificName"] + + BATCH_TXT_FILE_HEADER = ["BatchNumber", "BatchName", "GeneName", "Description", "ContactNumber", "ProtocolNumber", "ThematicNumber", "RefSeqName", "AlignmentFileName", "SeqName"] + + BATCH_LINE_FILE_HEADER = ["IndividualNumber", "Pos5", "Pos3", "BatchNumber", "Sequence"] + + def __init__(self): + self._csvFieldSeparator = ";" + self._txtSubFieldSeparator = ": " + self._txtFieldSeparator = "\n" + self._primerType = "Sequence" + self._csvLineSeparator = "\n" + self._txtLineSeparator = "//\n" + + def write(self, iMultifasta2SNPFile): + self._writeSubSNPFile(iMultifasta2SNPFile._outSubSNPFileName, iMultifasta2SNPFile._lSubSNPFileResults) + self._writeAlleleFile(iMultifasta2SNPFile._outAlleleFileName, iMultifasta2SNPFile._dAlleleFileResults) + self._writeIndividualFile(iMultifasta2SNPFile._outIndividualFileName, iMultifasta2SNPFile._lIndividualFileResults) + 
self._writeSequenceFiles(iMultifasta2SNPFile._outSequenceFSAFileName, iMultifasta2SNPFile._outSequenceCSVFileName, iMultifasta2SNPFile._lRefSequences, iMultifasta2SNPFile._taxon) + self._writeBatchFile(iMultifasta2SNPFile._outBatchFileName, iMultifasta2SNPFile._lBatchFileResults) + self._writeBatchLineFile(iMultifasta2SNPFile._outBatchLineFileName, iMultifasta2SNPFile._lBatchLineFileResults) + + def sortAlleleResultByAlleleNumber(self, dAlleleFileResults): + return sorted(dAlleleFileResults.items(), key=lambda(k,v):(v,k)) + + def _writeSubSNPFile(self, subSNPFileName, lSNP2Write): + outF = open(subSNPFileName, "w") + self._writeSNPFileHeader(outF) + for dSNP in lSNP2Write: + self._writeSNPFileLine(outF, dSNP) + outF.close() + + def _writeAlleleFile(self, alleleFileName, dAllele2Write): + outF = open(alleleFileName, "w") + self._writeAlleleFileHeader(outF) + lAlleleSortedResults = self.sortAlleleResultByAlleleNumber(dAllele2Write) + for tAllele in lAlleleSortedResults: + self._writeAlleleFileLine(outF, tAllele[0], tAllele[1]) + + outF.close() + + def _writeIndividualFile(self, individualFileName, lIndividual2Write): + sorted(lIndividual2Write, key=lambda Individual: (Individual['individualNumber'])) + outF = open(individualFileName, "w") + self._writeIndividualFileHeader(outF) + + for dIndiv in lIndividual2Write: + self._writeIndividualFileLine(outF, dIndiv) + + outF.close() + + def _writeSequenceFiles(self, sequenceFSAFileName, sequenceCSVFileName, lRefSequences, taxon): + outFSA = open(sequenceFSAFileName, "w") + outCSV = open(sequenceCSVFileName, "w") + self._writeSequenceCSVHeader(outCSV) + + for refSeq in lRefSequences: + refSeq.cleanGap() + self._writeSequenceFSAFile(outFSA, refSeq) + self._writeSequenceCSVLine(outCSV, refSeq, taxon) + + outFSA.close() + outCSV.close() + + def _writeSequenceFSAFile(self, outF, refSeq): + outF.write( ">%s\n" % ( refSeq.header ) ) + outF.write( "%s\n" % ( refSeq.sequence[0:refSeq.getLength()] ) ) + + + def _writeBatchFile(self, 
batchFileName, lBatchResults): + outF = open(batchFileName, "w") + for dBatchResults in lBatchResults: + for head in Multifasta2SNPFileWriter.BATCH_TXT_FILE_HEADER[:]: + if dBatchResults.has_key(head): + outF.write(head + self._txtSubFieldSeparator + str(dBatchResults[head]) + self._txtFieldSeparator) + else: + outF.write(head + self._txtSubFieldSeparator + self._txtFieldSeparator) + + outF.write(self._txtLineSeparator) + + outF.close() + + def _writeBatchLineFile(self, batchLineFileName, lBatchLineResults): + outF = open(batchLineFileName, "w") + self._writeBatchLineFileHeader(outF) + for dResult in lBatchLineResults: + self._writeBatchLineFileLine(outF, dResult) + outF.close() + + def _writeSNPFileHeader(self, outF): + for head in Multifasta2SNPFileWriter.SUB_SNP_FILE_HEADER[:-1]: + outF.write(head + self._csvFieldSeparator) + outF.write(Multifasta2SNPFileWriter.SUB_SNP_FILE_HEADER[-1] + self._csvLineSeparator) + + def _writeAlleleFileHeader(self, outF): + for head in Multifasta2SNPFileWriter.ALLELE_FILE_HEADER[:-1]: + outF.write(head + self._csvFieldSeparator) + outF.write(Multifasta2SNPFileWriter.ALLELE_FILE_HEADER[-1] + self._csvLineSeparator) + + def _writeIndividualFileHeader(self, outF): + for head in Multifasta2SNPFileWriter.INDIVIDUAL_FILE_HEADER[:-1]: + outF.write(head + self._csvFieldSeparator) + outF.write(Multifasta2SNPFileWriter.INDIVIDUAL_FILE_HEADER[-1] + self._csvLineSeparator) + + def _writeSequenceCSVHeader(self, outF): + for head in Multifasta2SNPFileWriter.SEQUENCE_CSV_FILE_HEADER[:-1]: + outF.write(head + self._csvFieldSeparator) + outF.write(Multifasta2SNPFileWriter.SEQUENCE_CSV_FILE_HEADER[-1] + self._csvLineSeparator) + + def _writeBatchLineFileHeader(self, outF): + for head in Multifasta2SNPFileWriter.BATCH_LINE_FILE_HEADER[:-1]: + outF.write(head + self._csvFieldSeparator) + outF.write(Multifasta2SNPFileWriter.BATCH_LINE_FILE_HEADER[-1] + self._csvLineSeparator) + + def _writeSNPFileLine(self, outF, dSNP): + outF.write(dSNP['subSNPName'] 
+ self._csvFieldSeparator) + outF.write(dSNP['confidenceValue'] + self._csvFieldSeparator + dSNP['type'] + self._csvFieldSeparator) + outF.write(str(dSNP['position']) + self._csvFieldSeparator + dSNP['5flank'] + self._csvFieldSeparator + dSNP['3flank'] + self._csvFieldSeparator) + outF.write(str(dSNP['length']) + self._csvFieldSeparator + str(dSNP['batchNumber']) + self._csvFieldSeparator) + outF.write(str(dSNP['lineName']) + self._csvFieldSeparator) + outF.write(self._primerType + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + str(dSNP['allele']) + self._csvLineSeparator) + + def _writeAlleleFileLine(self, outF, sAllele2Write, iAlleleNumber): + outF.write(str(iAlleleNumber) + self._csvFieldSeparator) + outF.write(sAllele2Write + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvLineSeparator) + + def _writeIndividualFileLine(self, outF, dIndividual): + outF.write(str(dIndividual['individualNumber']) + self._csvFieldSeparator) + outF.write(dIndividual['individualName'] + self._csvFieldSeparator + self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator) + outF.write(dIndividual['scientificName'] + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator+ self._csvFieldSeparator + self._csvFieldSeparator + self._csvLineSeparator) + + def _writeSequenceCSVLine(self, outF, refSeq, taxon): + outF.write(refSeq.header + self._csvFieldSeparator) + outF.write("Reference" + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator) + outF.write(taxon + self._csvLineSeparator) + + def _writeBatchLineFileLine(self, outF, dResult): + outF.write(str(dResult['IndividualNumber']) + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator) + 
outF.write(str(dResult['BatchNumber']) + self._csvFieldSeparator + self._csvLineSeparator) diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/MummerParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MummerParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,93 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class MummerParser(MapperParser):
    """
    Parser for Mummer output.
    Each mapping is read from two consecutive lines: a header line
    ("> name [Reverse] Len = N") and the match line that follows it.
    """

    def __init__(self, fileName, verbosity = 0):
        super(MummerParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(MummerParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["mummer"]


    def skipFirstLines(self):
        """Nothing is skipped at the top of the file."""
        pass


    def parseLine(self, line):
        """
        Build a mapping from the given header line and the next line of the file.
        @param line: the header line
        @type line: string
        @return: a mapping holding a single sub-mapping
        Exits the program (sys.exit) when either line does not match the expected layout.
        """
        mapping = Mapping()
        subMapping = SubMapping()

        # header: try the reverse-strand layout first, then the forward one
        direction = -1
        header = re.search(r"^>\s+(\S+)\s+Reverse\s+Len\s+=\s+(\d+)$", line)
        if header is None:
            direction = 1
            header = re.search(r"^>\s+(\S+)\s+Len\s+=\s+(\d+)$", line)
        if header is None:
            sys.exit("Header line %d '%s' is strange in Mummer file" % (self.currentLineNb, line))
        subMapping.queryInterval.setName(header.group(1))
        subMapping.queryInterval.setSize(int(header.group(2)))
        subMapping.queryInterval.setDirection(direction)

        # fetch the match line; at end of file the header line is kept and
        # rejected by the pattern below
        try:
            line = next(iter(self.handle))
            self.currentLineNb += 1
        except StopIteration:
            pass
        line = line.strip()

        # match line: target name, target start, query start, length
        match = re.search(r"^(\w+)\s+(\d+)\s+(\d+)\s+(\d+)$", line)
        if match is None:
            sys.exit("Line %d '%s' is strange in Mummer file" % (self.currentLineNb, line))
        subMapping.targetInterval.setName(match.group(1))
        subMapping.targetInterval.setStart(int(match.group(2)))
        subMapping.queryInterval.setStart(int(match.group(3)))
        subMapping.targetInterval.setSize(int(match.group(4)))

        mapping.addSubMapping(subMapping)
        return mapping
+# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class NCListParser(TranscriptListParser):
    """
    A class that reads an NC-list index file (a pickle) and iterates over the
    transcripts stored in the per-chromosome files it refers to.
    """

    def __init__(self, fileName, verbosity = 0):
        self.title = None
        TranscriptListParser.__init__(self, fileName, verbosity)
        self.parse()

    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["nclist"]

    def skipFirstLines(self):
        """Nothing is skipped at the top of the file."""
        return

    def parse(self):
        """
        Load the index: sorted file names, number of elements, number of elements
        per chromosome and the NC-lists, in that order, then reopen the files of
        every NC-list.
        """
        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # open index files from a trusted source.
        # Pickles are binary data, hence "rb"; the handle is closed even if
        # unpickling fails.
        handle = open(self.fileName, "rb")
        try:
            self.sortedFileNames = pickle.load(handle)
            self.nbElements = pickle.load(handle)
            self.nbElementsPerChromosome = pickle.load(handle)
            self.ncLists = pickle.load(handle)
            for ncList in self.ncLists.values():
                ncList._reopenFiles()
        finally:
            handle.close()
        self.chromosomes = sorted(self.nbElementsPerChromosome.keys())
        self.fileNames = dict((chromosome, self.ncLists[chromosome]._transcriptFileName) for chromosome in self.chromosomes)
        self.currentReader = None
        self.currentChrIndex = 0

    # The getters below used to read underscore-prefixed attributes
    # (self._sortedFileNames, ...) that parse() never sets, so each of them
    # raised AttributeError.

    def getSortedFileNames(self):
        """Return the sorted file names loaded by parse()."""
        return self.sortedFileNames

    def getNbElements(self):
        """Return the number of elements loaded by parse()."""
        return self.nbElements

    def getNbElementsPerChromosome(self):
        """Return the chromosome -> number of elements dictionary loaded by parse()."""
        return self.nbElementsPerChromosome

    def getNCLists(self):
        """Return the chromosome -> NC-list dictionary loaded by parse()."""
        return self.ncLists

    def reset(self):
        """Restart the iteration from the first chromosome."""
        self.currentChrIndex = 0
        self.currentReader = None

    def gotoAddress(self, address):
        """Move the current chromosome reader to the given address (a reader must already be open)."""
        self.currentReader.gotoAddress(address)

    def getCurrentAddress(self):
        """Alias of getCurrentTranscriptAddress()."""
        return self.getCurrentTranscriptAddress()

    def getCurrentTranscriptAddress(self):
        """Return the address reported by the current reader, or 0 when no reader is open yet."""
        if self.currentReader is None:
            return 0
        return self.currentReader.getCurrentTranscriptAddress()

    def getNextTranscript(self):
        """
        Return the next transcript, moving on to the next chromosome file when the
        current one is finished.
        @return: a transcript, or None when every chromosome has been read
        """
        if self.currentReader is None:
            if not self.chromosomes:
                return None
            self.currentReader = NCListFileUnpickle(self.fileNames[self.chromosomes[0]])
        transcript = self.currentReader.getNextTranscript()
        # The reader signals the end of its file with False. Loop rather than test
        # once, so that an empty chromosome file does not leak False to the caller.
        while transcript == False:
            self.currentChrIndex += 1
            if self.currentChrIndex >= len(self.chromosomes):
                return None
            self.currentReader = NCListFileUnpickle(self.fileNames[self.chromosomes[self.currentChrIndex]])
            transcript = self.currentReader.getNextTranscript()
        return transcript

    def getInfos(self):
        """Compute self.size as the sum of the sizes of all transcripts, then reset the iteration."""
        self.size = 0
        self.reset()
        progress = UnlimitedProgress(100000, "Getting information on %s." % (self.fileName), self.verbosity-9)
        # NOTE(review): this first read is never used; whether it makes the loop
        # below miss the first transcript depends on the inherited getIterator(),
        # which is not visible here -- confirm before removing.
        transcript = self.getNextTranscript()
        for transcript in self.getIterator():
            self.size += transcript.getSize()
            progress.inc()
        progress.done()
        self.reset()

    def getNbTranscripts(self):
        """Return the number of elements loaded by parse()."""
        return self.nbElements
class NucmerParser(MapperParser):
    """A class that parses the output of Nucmer"""

    def __init__(self, fileName, verbosity = 0):
        super(NucmerParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(NucmerParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["nucmer"]


    def skipFirstLines(self):
        """Nothing is skipped at the top of the file."""
        pass


    def parseLine(self, line):
        """
        Parse one line.
        A line starting with ">" only records the current chromosome; any other
        non-empty line must hold 8 whitespace-separated fields.
        @param line: the line to parse
        @type line: string
        @return: a mapping, or None for empty and ">" lines
        """
        if not line:
            return None
        if line.startswith(">"):
            self.currentChromosome = line[1:].split()[0]
            return None

        fields = line.strip().split()
        if len(fields) != 8:
            raise Exception("Line %d '%s' does not have a NucMer format" % (self.currentLineNb, line))

        subMapping = SubMapping()

        # target side: the two first fields, in either order
        target = subMapping.targetInterval
        target.setChromosome(self.currentChromosome)
        target.setName(self.currentChromosome)
        firstPos = int(fields[0])
        secondPos = int(fields[1])
        target.setStart(min(firstPos, secondPos))
        target.setEnd(max(firstPos, secondPos))
        target.setDirection(fields[6])

        # query side: always reported from 1 to the value of the fourth field
        query = subMapping.queryInterval
        query.setChromosome(fields[7])
        query.setName(fields[7])
        query.setStart(1)
        query.setEnd(int(fields[3]))
        query.setDirection("+")

        mapping = Mapping()
        mapping.addSubMapping(subMapping)
        mapping.setDirection(fields[6])
        mapping.setIdentity(float(fields[5]))
        mapping.setSize(int(fields[3]))

        return mapping
class PalsToAlign(object):
    """
    Convert the output from PALS (GFF2 format) into the 'align' format.

    Each output line holds: query name, query start, query end, subject name,
    subject start, subject end, the constant "0.0", score and percentage of
    identity, separated by tabulations. The output file is sorted with the
    external 'sort' command.
    """
    def __init__(self, inputPalsFileName="", outputAlignFileName="", removeSameSequences=False):
        """
        @param inputPalsFileName: name of the PALS GFF2 file to read
        @type inputPalsFileName: string
        @param outputAlignFileName: name of the 'align' file to write
        @type outputAlignFileName: string
        @param removeSameSequences: drop 100%-identity hits between two "chunk" sequences which both start at position 1
        @type removeSameSequences: boolean
        """
        self._removeSameSequences = removeSameSequences
        self._inputPalsFileName = inputPalsFileName
        self._outputAlignFileName = outputAlignFileName

    def run(self):
        """
        Convert the input file, then sort the result into the output file.
        Raises subprocess.CalledProcessError when 'sort' fails.
        """
        # Local imports: the module itself only imports time and os.
        import subprocess
        import tempfile

        # mkstemp avoids writing the temporary file into the current directory
        tmpFd, tmpFileName = tempfile.mkstemp(prefix="PalsToAlign")
        try:
            with os.fdopen(tmpFd, "w") as tmpFile:
                with open(self._inputPalsFileName, "r") as palsFile:
                    for line in palsFile:
                        alignLine = self._convertLine(line)
                        if alignLine is not None:
                            tmpFile.write(alignLine)
            # An argument list (no shell) keeps file names with spaces or shell
            # metacharacters from being interpreted.
            with open(self._outputAlignFileName, "w") as outFile:
                subprocess.check_call(["sort", "-k", "1,1", "-k", "4,4", "-k", "2,2n", "-k", "3,3n",
                                       "-k", "5,5n", "-k", "6,6n", "-k", "8,8n", tmpFileName],
                                      stdout=outFile)
        finally:
            os.remove(tmpFileName)

    def _convertLine(self, line):
        """Return the 'align' line for one PALS line, or None when the line is blank or filtered out."""
        if not line.strip():
            return None

        # rstrip (rather than dropping the last character) keeps the final value
        # intact when the last line of the file has no newline
        data = line.rstrip("\n").split("\t")

        qryName = data[0]
        qryStart = data[3]
        qryEnd = data[4]
        score = data[5]
        strand = data[6]
        attributes = data[8].split()

        sbjName = attributes[1]
        sbjStart = attributes[2]
        sbjEnd = attributes[3][:-1]     # drop the character glued to the subject end (";" in PALS output)
        percId = (1 - float(attributes[-1])) * 100.0

        if strand != "+":
            sbjStart, sbjEnd = sbjEnd, sbjStart

        if self._removeSameSequences \
        and "chunk" in qryName and "chunk" in sbjName \
        and min(int(qryStart), int(qryEnd)) == 1 \
        and min(int(sbjStart), int(sbjEnd)) == 1 \
        and percId == 100.0:
            return None

        # Coordinates are compared as numbers: as strings, "9" < "10" is false.
        if int(qryStart) < int(qryEnd):
            return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (qryName, qryStart, qryEnd, sbjName, sbjStart, sbjEnd, "0.0", score, percId)
        return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (qryName, qryEnd, qryStart, sbjName, sbjEnd, sbjStart, "0.0", score, percId)
distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import sys +from commons.core.parsing.TranscriptListParser import TranscriptListParser +from commons.core.parsing.MapperParser import MapperParser +from commons.core.parsing.SequenceListParser import SequenceListParser +from commons.core.parsing.BedParser import BedParser +from commons.core.parsing.GffParser import GffParser +from commons.core.parsing.MapperParser import MapperParser +from commons.core.parsing.CoordsParser import CoordsParser +from commons.core.parsing.SeqmapParser import SeqmapParser +from commons.core.parsing.SoapParser import SoapParser +from commons.core.parsing.Soap2Parser import Soap2Parser +from commons.core.parsing.BlastParser import BlastParser +from commons.core.parsing.PslParser import PslParser +from commons.core.parsing.RmapParser import RmapParser +from commons.core.parsing.ShrimpParser import ShrimpParser +from commons.core.parsing.AxtParser import AxtParser +from commons.core.parsing.ExoParser import ExoParser +from commons.core.parsing.MaqParser import MaqParser +from commons.core.parsing.SamParser import SamParser +from commons.core.parsing.BamParser import BamParser +from commons.core.parsing.BowtieParser import BowtieParser +from commons.core.parsing.ElandParser import ElandParser +from commons.core.parsing.GtfParser import GtfParser +from commons.core.parsing.FastaParser import FastaParser +from commons.core.parsing.FastqParser import FastqParser +from commons.core.parsing.MapParser import MapParser +from commons.core.parsing.WigParser import WigParser +from commons.core.parsing.NCListParser import NCListParser +from commons.core.parsing.PklParser import PklParser + +#Attention!! Do not delete the imports!! They are used to know the type of file format!!! 
class ParserChooser(object):
    """
    A class that finds the correct parser
    @ivar type: transcript / mapping / sequence parser (None until findFormat succeeds)
    @type type: string
    @ivar parserClass: the class of the parser (None until findFormat succeeds)
    @type parserClass: class
    @ivar verbosity: verbosity
    @type verbosity: int
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type verbosity: int
        """
        self.type = None
        self.parserClass = None
        self.verbosity = verbosity


    def findFormat(self, format, type = None):
        """
        Find the correct parser class and store it, together with its type.
        Nothing is returned: use getParser() and getType() afterwards.
        Only the direct subclasses of the parser families are examined.
        @param format: the format
        @type format: string
        @param type: transcript / mapping / sequence parser (None is all)
        @type type: string
        @raise Exception: when the type is unknown or no parser handles the format
        """
        # A list of pairs (instead of a dictionary) fixes the order in which the
        # families are searched when type is None.
        if type is None:
            families = [(TranscriptListParser, "transcript"), (MapperParser, "mapping"), (SequenceListParser, "sequence")]
        elif type == "transcript":
            families = [(TranscriptListParser, "transcript")]
        elif type == "mapping":
            families = [(MapperParser, "mapping")]
        elif type == "sequence":
            families = [(SequenceListParser, "sequence")]
        else:
            raise Exception("Do not understand format type '%s'" % (type))

        for classType, typeName in families:
            for parserClass in classType.__subclasses__():
                if format in parserClass.getFileFormats():
                    self.parserClass = parserClass
                    self.type = typeName
                    return
        raise Exception("Cannot get parser for format '%s'" % (format))


    def getParser(self, fileName):
        """
        Get an instance of the parser previously found
        @param fileName: name of the file to parse
        @type fileName: string
        @return: the parser
        @raise TypeError: when findFormat has not selected a parser yet
        """
        if self.parserClass is None:
            # same exception type as calling None, but with a usable message
            raise TypeError("No parser selected: call findFormat() before getParser()")
        return self.parserClass(fileName, self.verbosity)


    def getType(self):
        """
        Get the type of parser previously found
        @return: the type of parser
        """
        return self.type
class PathNum2Id( object ):
    """
    Renumber the path identifiers of a 'path' file made by concatenating several
    'path' files, so that each (path, query, subject) triple gets its own identifier.
    """

    def __init__(self):
        self._inFileName = None
        self._outFileName = None

    def setInFileName(self, fileName):
        """Set the name of the 'path' file to read."""
        self._inFileName = fileName

    def setOutFileName(self, fileName):
        """Set the name of the 'path' file to write."""
        self._outFileName = fileName

    def run( self ):
        """
        Adapt the path IDs as the input file is the concatenation of several 'path' files.

        New identifiers start at 1 and are given in order of first appearance of
        each (path, query name, subject name) triple. Blank lines are skipped.
        """
        dID2count = {}
        count = 1
        with open( self._inFileName, "r" ) as inFile:
            with open( self._outFileName, "w" ) as outFile:
                # kept as attributes, as before, for code that may look at them
                self._inFile = inFile
                self._outFile = outFile
                for line in inFile:
                    strippedLine = line.rstrip("\n")
                    if not strippedLine.strip():
                        continue
                    data = strippedLine.split("\t")
                    path = data[0]
                    qryName = data[1]
                    qryStart = int(data[2])
                    qryEnd = int(data[3])
                    sbjName = data[4]
                    sbjStart = int(data[5])
                    sbjEnd = int(data[6])
                    BLAST_Eval = data[7]
                    BLAST_score = data[8]
                    percId = data[9]
                    key_id = path + "-" + qryName + "-" + sbjName
                    # test membership on the dictionary itself: on Python 2,
                    # .keys() builds a list and makes every lookup linear
                    if key_id not in dID2count:
                        dID2count[ key_id ] = count
                        count += 1
                    newPath = dID2count[ key_id ]
                    outFile.write( "%i\t%s\t%i\t%i\t%s\t%i\t%i\t%s\t%s\t%s\n" % ( newPath, qryName, qryStart, qryEnd, sbjName, sbjStart, sbjEnd, BLAST_Eval, BLAST_score, percId ) )
class PilerTAToGrouperMap(object):
    """
    Convert the output file from Piler into grouper format.

    Three files are written: the pile/pyramid info file (outputFileName), and,
    next to the motif file, "<motif>.grp" and "<motif>.grp.map".
    """
    def __init__(self, inputGffFileName, inputPYRFileName, inputMOTIFFileName, outputFileName):
        self._inputGffFileName = inputGffFileName
        self._inputPYRFileName = inputPYRFileName
        self._inputMOTIFFileName = inputMOTIFFileName
        self._outFileName = outputFileName

    def run(self):
        """Write the info file, then the two grouper files built from it."""
        self._writePileInfoFile()
        self._writeGrouperFiles()

    def _writePileInfoFile(self):
        """Step 0: write one "pile<TAB>pyramid" line per line of the pyramid file."""
        # The GFF lines are read once. The previous code called readlines()
        # inside the loop below, so the file was exhausted after the first
        # pyramid and every following pyramid got an empty pile.
        with open(self._inputGffFileName, "r") as inFileGff:
            lGffLines = inFileGff.readlines()

        with open(self._inputPYRFileName, "r") as inFilePyr:
            with open(self._outFileName, "w") as outFile:
                for pyrLine in inFilePyr:
                    pileIndex = ""
                    # ninth column; it keeps its end of line, which also ends the output line
                    pyrIndex = pyrLine.split('\t')[8].replace('PyramidIndex', 'Pyramid')
                    for gffLine in lGffLines:
                        if pyrIndex in gffLine:
                            pileIndex = gffLine.split(';')[1].strip()
                            break
                    outFile.write("%s\t%s" % (pileIndex, pyrIndex))

    def _writeGrouperFiles(self):
        """Step 1: add the pile to each motif and write the ".grp" and ".grp.map" files."""
        outFileMotifGrpFileName = self._inputMOTIFFileName + ".grp"
        outFileMotifGrpMapFileName = self._inputMOTIFFileName + ".grp.map"

        with open(self._outFileName, "r") as inFileInfo:
            lInfoLines = inFileInfo.readlines()
        # The info lines are scanned forward only: the position is kept from one
        # motif to the next.
        lineInfoIndex = 0

        with open(self._inputMOTIFFileName, "r") as inFileMotif:
            with open(outFileMotifGrpFileName, "w") as outFileMotifGrp:
                with open(outFileMotifGrpMapFileName, "w") as outFileMotifGrpMap:
                    for countMotif, lineMotif in enumerate(inFileMotif, 1):
                        motif, pyrNameMotif = lineMotif.split(';')[:2]
                        pyrNameMotif = pyrNameMotif.strip()
                        pileNameMotif = ""

                        while lineInfoIndex < len(lInfoLines):
                            lineInfo = lInfoLines[lineInfoIndex]
                            if pyrNameMotif in lineInfo:
                                pileNameMotif = lineInfo.split('\t')[0]
                                break
                            lineInfoIndex += 1

                        #translate to Grouper IdFormat
                        # (IndexError here means that no pile was found for the motif)
                        pyrID = pyrNameMotif.split(' ')[1]
                        pileID = pileNameMotif.split(' ')[1]
                        dataMotif = motif.split('\t')
                        chrm = dataMotif[0]
                        start, end = dataMotif[3:5]
                        memberID = "MbS%sGr" % (countMotif) + pyrID + "Cl" + pileID

                        outFileMotifGrp.write("%s\t%s\t%s\t%s\n" % (memberID, motif, pileNameMotif, pyrNameMotif))
                        outFileMotifGrpMap.write("%s\t%s\t%s\t%s\n" % (memberID, chrm, start, end))
class PklParser(TranscriptListParser):
    """A class that parses the intern PKL file and create a transcript list"""

    def __init__(self, fileName, verbosity = 1):
        self.title = None
        super(PklParser, self).__init__(fileName, verbosity)
        # NOTE(review): if the parent constructor already opened a handle, it is
        # replaced here without being closed -- confirm against TranscriptListParser.
        self.handle = open(fileName, "rb")
        self.verbosity = verbosity
        self.initAddress = 0
        self.address = self.initAddress
        self.over = False
        self.chromosome = None

    def __del__(self):
        super(PklParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["pkl"]


    def skipFirstLines(self):
        """Nothing is skipped at the top of the file."""
        return


    def reset(self):
        """Go back to the start of the file and forget the initial address and the end-of-data flag."""
        self.handle.seek(0)
        self.initAddress = 0
        # keep the bookkeeping in step with the file position: these two used to
        # keep their stale values, so isOver() stayed True after a reset
        self.address = 0
        self.over = False


    def setChromosome(self, chromosome):
        """Restrict the reading to one chromosome: reading stops at the first transcript of another one."""
        self.chromosome = chromosome


    def gotoAddress(self, address):
        """Move to the given position of the file."""
        self.handle.seek(address)
        self.address = address
        # repositioning makes a previous end-of-data state obsolete
        self.over = False


    def getNextTranscript(self):
        """
        Read the next transcript.
        @return: the transcript, or False at the end of the file or when the
                 transcript is not on the chromosome set with setChromosome
        """
        self.address = self.handle.tell()
        try:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only read PKL files from a trusted source.
            transcript = pickle.load(self.handle)
        except EOFError:
            self.over = True
            return False
        if self.chromosome is not None and transcript.getChromosome() != self.chromosome:
            self.over = True
            return False
        return transcript


    def getIterator(self):
        """Yield the transcripts from the initial address until getNextTranscript gives a false value."""
        self.gotoAddress(self.initAddress)
        while True:
            transcript = self.getNextTranscript()
            if not transcript:
                self.over = True
                return
            yield transcript


    def setInitAddress(self, address):
        """Set the position from which getIterator starts."""
        self.initAddress = address


    def getCurrentTranscriptAddress(self):
        """Return the position of the last transcript read (or of the last gotoAddress)."""
        return self.address


    def isOver(self):
        """Tell whether the last read reached the end of the data."""
        return self.over
class PslParser(MapperParser):
    """A class that parses the output of PSL format (of SSAHA and BLAT)"""

    def __init__(self, fileName, verbosity = 0):
        super(PslParser, self).__init__(fileName, verbosity)

    def __del__(self):
        super(PslParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """
        @return: the list of the file formats handled by this parser
        """
        return ["psl"]

    def getInfos(self):
        """
        Read the whole file once to collect the set of target names, the
        number of mappings and a size counter, then rewind the file
        """
        self.chromosomes = set()
        self.nbMappings = 0
        self.size = 0
        self.reset()
        progress = UnlimitedProgress(100000, "Getting info on PSL file, # mappings read:", self.verbosity)
        for line in self.handle:
            progress.inc()
            line = line.strip()
            if line == "":
                continue
            parts = line.split("\t")
            chromosome = parts[13]
            self.chromosomes.add(chromosome)
            self.nbMappings += 1
            # NOTE(review): this adds the number of characters of the first
            # column, not its numeric value -- confirm that this is intended.
            self.size += len(parts[0])
        self.reset()
        progress.done()

    def skipFirstLines(self):
        """
        Skip the header, i.e. every line up to (and including) the line of
        dashes.  If the file has no such line (header-less PSL), go back to
        where the reading started so that no mapping is lost; the previous
        implementation never terminated in that case.
        """
        startPosition = self.handle.tell()
        nbSkipped = 0
        while True:
            line = self.handle.readline()
            if not line:
                # End of file reached without separator: there is no header.
                self.handle.seek(startPosition)
                if nbSkipped:
                    self.currentLineNb -= nbSkipped
                return
            if "------" in line:
                return
            self.currentLineNb += 1
            nbSkipped += 1

    def _computeStarts(self, seqSize, blockSize, start, targetStrand):
        """
        Convert a block start given on the reverse strand to the forward strand
        @param seqSize:      size of the whole sequence
        @param blockSize:    size of the block
        @param start:        start of the block, as read in the file
        @param targetStrand: "+" to keep the start unchanged, anything else
                             to mirror it
        @return: the (possibly mirrored) start
        """
        if targetStrand == "+":
            return start
        return seqSize - blockSize - start

    def parseLine(self, line):
        """
        Parse one PSL line
        @param line: the line to parse
        @type  line: string
        @return: a mapping with one sub-mapping per alignment block
        @raise Exception: if the line does not have the expected format
        """
        m = re.search(r"^\s*(psl:\s+)?(\d+)\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+(\d+)\s+([+-]{1,2})\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s*$", line)
        if m is None:
            raise Exception("\nLine %d '%s' does not have a PSL format" % (self.currentLineNb, line))

        mapping = Mapping()

        # The strand field has one or two characters: the first one is used
        # for the query, the optional second one for the target.
        strands = m.group(7)
        queryStrand = strands[0]
        targetStrand = "+" if len(strands) == 1 else strands[1]

        queryName = m.group(8)
        querySize = int(m.group(9))
        targetName = m.group(12)
        targetSize = int(m.group(13))

        # Split the comma-separated block lists once instead of once per block.
        blockSizes = m.group(17).split(",")
        queryStarts = m.group(18).split(",")
        targetStarts = m.group(19).split(",")

        for i in range(0, int(m.group(16))):
            size = int(blockSizes[i])
            queryStart = int(queryStarts[i])
            targetStart = int(targetStarts[i])

            subMapping = SubMapping()
            subMapping.setSize(size)
            subMapping.setDirection(queryStrand)

            queryInterval = Interval()
            targetInterval = Interval()

            queryInterval.setName(queryName)
            # NOTE(review): the query start is mirrored according to the
            # *target* strand, as in the original code -- confirm.
            queryStart = self._computeStarts(querySize, size, queryStart, targetStrand)
            queryInterval.setStart(queryStart + 1)
            queryInterval.setEnd(queryStart + size)
            queryInterval.setDirection(queryStrand)

            targetInterval.setChromosome(targetName)
            targetStart = self._computeStarts(targetSize, size, targetStart, targetStrand)
            targetInterval.setStart(targetStart + 1)
            targetInterval.setEnd(targetStart + size)
            targetInterval.setDirection(targetStrand)

            subMapping.setQueryInterval(queryInterval)
            subMapping.setTargetInterval(targetInterval)
            mapping.addSubMapping(subMapping)

        mapping.setSize(int(m.group(2)) + int(m.group(3)) + int(m.group(4)))
        mapping.setNbMismatches(int(m.group(3)) + int(m.group(4)))
        mapping.setNbGaps(int(m.group(5)))
        mapping.setDirection(queryStrand)

        queryInterval = Interval()
        targetInterval = Interval()

        queryInterval.setName(queryName)
        queryInterval.setStart(min(int(m.group(10)), int(m.group(11))))
        queryInterval.setEnd(max(int(m.group(10)), int(m.group(11))))
        queryInterval.setDirection(queryStrand)

        targetInterval.setChromosome(targetName)
        targetInterval.setStart(min(int(m.group(14)) + 1, int(m.group(15))))
        targetInterval.setEnd(max(int(m.group(14)) + 1, int(m.group(15))))
        targetInterval.setDirection(targetStrand)

        mapping.setQueryInterval(queryInterval)
        mapping.setTargetInterval(targetInterval)

        return mapping
-0,0 +1,66 @@ +*** DESCRIPTION: *** +This program takes as input a multifasta file (with sequences already aligned together, formatted in fasta in the same file), considers the first sequence as the reference sequence, infers polymorphisms and generates output files in GnpSNP exchange format. + + +*** INSTALLATION: *** +Dependencies: +- First you need Python installed in your system. +- Repet libraries are also required. + +*** OPTIONS OF THE LAUNCHER: *** + + -h: this help + +Mandatory options: + -b: Name of the batch of submitted sequences + -g: Name of the gene + -t: Scientific name of the taxon concerned + +Exclusive options (use either the first or the second) + -f: Name of the multifasta input file (for one input file) + -d: Name of the directory containing multifasta input file(s) (for several input files) + + + +*** COMMAND LINE EXAMPLE (for package use): *** +- First, you need to set up the environment variable PYTHONPATH (to link with the dependencies). + +- Then for one input file (here our example), run: + +python multifastaParserLauncher.py -b Batch_test -g GeneX -t "Arabidopsis thaliana" -f Exemple_multifasta_input.fasta + + +- For several input files, create a directory in the root of the uncompressed package and put your input files in it. Then use this type of command line: + +python multifastaParserLauncher.py -b Batch_test -g GeneX -t "Arabidopsis thaliana" -d input_directory + +Each one of the input files will generate a directory with its set of output files. + + +*** SIMPLE USE (for package use): *** +Two executables (one for windows, the other for linux/unix) are in the package. +They show the command lines to use in order to set up environment variables and then to run the parser on our sample input file (Example_multifasta_input.fasta). +You can edit the executable and customize the command line to use it with your own input file. 
+ + +*** BACKLOG (next version) *** +When the launcher is called for several input files (with -d option), the parser should be able to generate only one set of files describing all the batches (one batch per input file). +So below are listed the tasks of the backlog dedicated to this feature: + +- in Multifasta2SNPFile class: + # CONSTRUCTOR: Modify the constructor to add a "several batches" mode called without BatchName and GeneName + # RUNNING METHOD: Add the run_several_batches(directory) method that will browse the input files and iterate over them to run each of them successively (see runSeveralInputFile() method of the launcher) + => 2 days + + # BATCH MANAGEMENT: Modify createBatchDict() to create one batch per file in the dictionary and add a class variable to point toward the current batch (ex: self._iCurrentLineNumber) + # BATCH-LINE MANAGEMENT: Modify _completeBatchLineListWithCurrentIndividual method to allow several batch and link lines to batches (for the moment hard coded batch no1) + # SUBSNP MANAGEMENT: check that all elements (dSUbSNP) added in SubSNP list (lSubSNPFileResults) is linked to the current batch (for the moment hard coded batch no1) + Impacted methods: manageSNPs(), createSubSNPFromAMissingPolym(), addMissingAllelesAndSubSNPsForOnePolym(), mergeAllelesAndSubSNPsFromOverlappingIndels() + => + 2 days + +- in Multifasta2SNPFileWriter class: + # Modify all the method _writeFile (ex: _writeSubSNPFile) to write in append mode and externalize all open and close file + # Create one method to open all the output files and call it in Multifasta2SNPFile run_several_batches method + # Create one method to close all the output files and call it in Multifasta2SNPFile run_several_batches method + + => + 2 days diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/RmapParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/RmapParser.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,76 @@ +# +# Copyright INRA-URGI 2009-2010 
+# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class RmapParser(MapperParser):
    """Parser which turns each line of an Rmap output file into a mapping"""

    def __init__(self, fileName, verbosity = 0):
        super(RmapParser, self).__init__(fileName, verbosity)

    def __del__(self):
        super(RmapParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """
        @return: the list of the file formats handled by this parser
        """
        return ["rmap"]

    def skipFirstLines(self):
        """
        Rmap files have no header: nothing to skip
        """
        pass

    def parseLine(self, line):
        """
        Build a mapping from one Rmap line
        @param line: the line to parse
        @type  line: string
        @return: the mapping (the program exits if the line is malformed)
        """
        fields = re.search(r"^\s*(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+([+-])\s*$", line)
        if fields is None:
            sys.exit("\nLine %d '%s' does not have a RMAP format" % (self.currentLineNb, line))

        chromosome, firstBound, secondBound, readName, nbMismatches, strand = fields.groups()
        firstBound = int(firstBound)
        secondBound = int(secondBound)

        mapping = Mapping()
        target = mapping.targetInterval
        query = mapping.queryInterval

        # The two genomic bounds may come in any order.
        target.setChromosome(chromosome)
        target.setStart(min(firstBound, secondBound))
        target.setEnd(max(firstBound, secondBound))

        # The query span is read back from the target interval.
        span = target.getEnd() - target.getStart()
        query.setName(readName)
        query.setStart(1)
        query.setSize(span)

        mapping.setSize(span)
        mapping.setNbMismatches(int(nbMismatches))
        mapping.setDirection(strand)

        return mapping
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class SamParser(MapperParser):
    """A class that parses SAM format (as given by BWA)"""

    def __init__(self, fileName, verbosity = 0):
        super(SamParser, self).__init__(fileName, verbosity)

    def __del__(self):
        super(SamParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """
        @return: the list of the file formats handled by this parser
        """
        return ["sam"]

    def skipFirstLines(self):
        """
        Nothing to skip: header lines are ignored line by line
        """
        pass

    def getInfos(self):
        """
        Read the whole file once to collect the set of chromosome names, the
        number of mappings and a size counter, then rewind the file
        """
        self.chromosomes = set()
        self.nbMappings = 0
        self.size = 0
        if self.verbosity >= 10:
            # Single-argument print(...) behaves the same in Python 2 and 3.
            print("Getting information on SAM file")
        self.reset()
        for line in self.handle:
            line = line.strip()
            if line == "" or line[0] == "@":
                continue
            parts = line.split("\t")
            chromosome = parts[2]
            if chromosome != "*":
                self.chromosomes.add(chromosome)
            self.nbMappings += 1
            # NOTE(review): parseLine() reads the sequence from field 9, but
            # this adds the length of the text of field 8 -- confirm.
            self.size += len(parts[8])
            if self.verbosity >= 10 and self.nbMappings % 100000 == 0:
                sys.stdout.write("    %d mappings read\r" % (self.nbMappings))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            print("    %d mappings read" % (self.nbMappings))
            print("Done.")

    def _closeSubMapping(self, mapping, subMapping, queryOffset, targetEnd):
        """
        Set the ends of a pending sub-mapping and attach it to the mapping
        @param mapping:     the mapping which receives the sub-mapping
        @param subMapping:  the pending sub-mapping, or None if there is none
        @param queryOffset: current offset on the read
        @param targetEnd:   end of the sub-mapping on the target
        """
        if subMapping is None:
            return
        subMapping.queryInterval.setEnd(queryOffset - 1)
        subMapping.targetInterval.setEnd(targetEnd)
        mapping.addSubMapping(subMapping)

    def parseLine(self, line):
        """
        Parse one SAM line
        @param line: the line to parse
        @type  line: string
        @return: a mapping, or None for an empty line, a header line ("@...")
                 or a read flagged as unmapped (0x4)
        @raise Exception: if the line has fewer than 11 fields, or if the
                          CIGAR string contains an unknown operation
        """
        line = line.strip()
        # Empty lines used to crash on line[0]; treat them like header lines.
        if not line or line[0] == "@":
            return None

        fields = line.split("\t")
        if len(fields) < 11:
            raise Exception("Line %d '%s' does not look like a SAM line (number of fields is %d instead of 11)" % (self.currentLineNb, line, len(fields)))

        name = fields[0]
        flag = int(fields[1])

        if (flag & 0x4) == 0x4:
            return None

        direction = 1 if (flag & 0x10) == 0x0 else -1
        chromosome = fields[2]
        genomeStart = int(fields[3])
        cigar = fields[5]
        sequence = fields[9]
        tags = fields[11:]

        mapping = Mapping()
        nbOccurrences = 1
        nbMismatches = 0
        nbGaps = 0
        subMapping = None
        queryOffset = 0
        targetOffset = 0
        currentNumber = 0
        readStart = None

        # Optional tags look like "X0:i:3": the value starts at index 5.
        for tag in tags:
            key = tag[:2]
            if key == "X0":
                nbOccurrences = int(tag[5:])
            elif key == "X1":
                nbOccurrences += int(tag[5:])
            elif key == "XM":
                nbMismatches = int(tag[5:])
        mapping.setTagValue("nbOccurrences", nbOccurrences)
        mapping.setTagValue("quality", int(fields[4]))

        # The CIGAR string is read character by character: digits build up
        # the length of the operation given by the letter which follows.
        for char in cigar:
            if "0" <= char <= "9":
                currentNumber = currentNumber * 10 + (ord(char) - ord("0"))
                continue
            # match ("=" and "X" are alignment matches too)
            if char in "M=X":
                if readStart is None:
                    readStart = queryOffset
                if subMapping is None:
                    subMapping = SubMapping()
                    subMapping.setSize(currentNumber)
                    subMapping.setDirection(direction)
                    subMapping.queryInterval.setName(name)
                    subMapping.queryInterval.setStart(queryOffset)
                    subMapping.queryInterval.setDirection(direction)
                    subMapping.targetInterval.setChromosome(chromosome)
                    subMapping.targetInterval.setStart(genomeStart + targetOffset)
                    subMapping.targetInterval.setDirection(1)
                targetOffset += currentNumber
                queryOffset += currentNumber
            # insertion on the read
            elif char == "I":
                nbGaps += 1
                queryOffset += currentNumber
            # insertion on the genome
            elif char == "D":
                self._closeSubMapping(mapping, subMapping, queryOffset, genomeStart + targetOffset - 1)
                subMapping = None
                nbGaps += 1
                targetOffset += currentNumber
            # intron
            elif char == "N":
                self._closeSubMapping(mapping, subMapping, queryOffset, genomeStart + targetOffset - 1)
                subMapping = None
                targetOffset += currentNumber
            # soft clipping (counted as substitutions)
            # NOTE(review): clipped bases also advance the target offset here,
            # as in the original code -- confirm against the SAM specification.
            elif char == "S":
                nbMismatches += currentNumber
                targetOffset += currentNumber
                queryOffset += currentNumber
            # hard clipping
            elif char == "H":
                targetOffset += currentNumber
                queryOffset += currentNumber
            # padding: nothing to do, but the number is consumed below so
            # that it does not leak into the next operation
            elif char == "P":
                pass
            else:
                raise Exception("Do not understand paramer '%s' in line %s" % (char, line))
            currentNumber = 0

        self._closeSubMapping(mapping, subMapping, queryOffset, genomeStart + targetOffset - 1)

        mapping.queryInterval.setStart(readStart)
        mapping.queryInterval.setEnd(queryOffset - 1)
        mapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
        mapping.setNbMismatches(nbMismatches)
        mapping.setNbGaps(nbGaps)

        mapping.queryInterval.setName(name)
        mapping.queryInterval.setDirection(direction)
        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(genomeStart)
        mapping.targetInterval.setDirection(direction)
        mapping.setSize(len(sequence))
        mapping.setDirection(direction)

        return mapping
class SeqmapParser(MapperParser):
    """Parser which turns each line of a SeqMap output file into a mapping"""

    def __init__(self, fileName, verbosity = 0):
        super(SeqmapParser, self).__init__(fileName, verbosity)

    def __del__(self):
        super(SeqmapParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """
        @return: the list of the file formats handled by this parser
        """
        return ["seqmap"]

    def skipFirstLines(self):
        """
        Skip the first line if it is the header (it then contains "trans_id")
        and record in startingPoint where the data start
        """
        origin = self.handle.tell()
        self.startingPoint = origin
        self.currentLineNb += 1
        firstLine = self.handle.readline()
        if "trans_id" not in firstLine:
            # Not a header: undo the read so that the line is parsed as data.
            self.currentLineNb -= 1
            self.handle.seek(origin)
        self.startingPoint = self.handle.tell()

    def parseLine(self, line):
        """
        Build a mapping from one SeqMap line
        @param line: the line to parse
        @type  line: string
        @return: the mapping (the program exits if the line is malformed)
        """
        fields = re.search(r"^\s*(\S+)\t+(\d+)\t+(\w+)\t+([^\t]+)\t+(\w+)\t+(\d+)\t+([+-])\s*$", line)
        if fields is None:
            sys.exit("\nLine %d '%s' does not have a SeqMap format" % (self.currentLineNb, line))

        chromosome = fields.group(1)
        genomeStart = int(fields.group(2))
        # Every size is the number of characters of the third column.
        length = len(fields.group(3))
        readName = fields.group(4)

        mapping = Mapping()
        target = mapping.targetInterval
        query = mapping.queryInterval

        target.setChromosome(chromosome)
        target.setStart(genomeStart)
        target.setSize(length)

        query.setName(readName)
        query.setStart(1)
        query.setSize(length)

        mapping.setSize(length)
        mapping.setNbMismatches(int(fields.group(6)))
        mapping.setDirection(fields.group(7))

        return mapping
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class SequenceListParser(object):
    """
    A virtual class that reads a list of sequences.
    Sub-classes provide parseOne(), which returns the next sequence of the
    file, or None when there is no sequence left.
    @ivar verbosity:       verbosity
    @type verbosity:       int
    @ivar fileName:        name of the file to parse
    @type fileName:        string
    @ivar handle:          file to parse
    @type handle:          file
    @ivar nbSequences:     number of sequences in the file
    @type nbSequences:     int
    @ivar nbReadSequences: number of sequences read
    @type nbReadSequences: int
    @ivar currentLine:     line currently read
    @type currentLine:     string
    @ivar size:            total number of nucleotides in the sequences
    @type size:            int
    @ivar sizes:           number of nucleotides per sequences
    @type sizes:           dict of string to int
    """

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        @param fileName:  name of the file to parse
        @type  fileName:  string
        @raise Exception: if the file cannot be opened
        """
        self.verbosity = verbosity
        self.fileName = fileName
        self.nbSequences = None
        self.nbReadSequences = 0
        self.currentLine = None
        self.size = None
        self.sizes = None
        # Defined before open() so that __del__ is safe if open() fails.
        self.handle = None
        try:
            self.handle = open(self.fileName, "rb")
        except IOError:
            raise Exception("Error! Sequence file '%s' does not exist! Exiting..." % (self.fileName))

    def __del__(self):
        """
        Destructor
        """
        # The handle is missing or None when the constructor failed.
        handle = getattr(self, "handle", None)
        if handle is not None and not handle.closed:
            handle.close()

    def close(self):
        """
        Close file handle
        """
        self.handle.close()

    def reset(self):
        """
        Prepare the file to be read again from start
        """
        self.handle.seek(0)
        self.currentLine = None
        self.nbReadSequences = 0

    @staticmethod
    def getFileFormats(self = None):
        """
        Placeholder, to be overridden by the sub-classes.
        The parameter is kept for backward compatibility only; it is now
        optional so that the method can be called like the sub-classes' ones.
        """
        pass

    def parse(self):
        """
        Parse the whole file in one shot
        @return: a list of sequence
        """
        sequenceList = SequenceList()
        progress = Progress(self.getNbSequences(), "Reading %s" % (self.fileName), self.verbosity)
        for sequence in self.getIterator():
            sequenceList.addSequence(sequence)
            progress.inc()
        progress.done()
        return sequenceList

    def getIterator(self):
        """
        Iterate on the file, sequence by sequence
        @return: an iterator to sequences
        """
        self.reset()
        sequence = self.parseOne()
        while sequence is not None:
            self.nbReadSequences += 1
            yield sequence
            sequence = self.parseOne()

    def getInfos(self):
        """
        Get some generic information about the sequences
        """
        self.nbSequences = 0
        self.size = 0
        self.reset()
        if self.verbosity >= 10:
            # Single-argument print(...) behaves the same in Python 2 and 3.
            print("Getting information on %s." % (self.fileName))
        for sequence in self.getIterator():
            self.nbSequences += 1
            self.size += sequence.getSize()
            if self.verbosity >= 10 and self.nbSequences % 100000 == 0:
                sys.stdout.write("    %d sequences read\r" % (self.nbSequences))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            print("    %d sequences read" % (self.nbSequences))
            print("Done.")

    def getNbSequences(self):
        """
        Get the number of sequences in the file
        @return: the number of sequences
        """
        if self.nbSequences is None:
            self.getInfos()
        return self.nbSequences

    def getNbItems(self):
        """
        Get the number of sequences in the file
        @return: the number of sequences
        """
        return self.getNbSequences()

    def getSize(self):
        """
        Get the size of all the sequences
        @return: the size
        """
        if self.size is None:
            self.getInfos()
        return self.size

    def getRegions(self):
        """
        Get the names of the sequences
        @return: the names
        """
        if self.sizes is not None:
            return self.sizes.keys()

        self.sizes = {}
        self.reset()
        if self.verbosity >= 10:
            print("Getting information on %s." % (self.fileName))
        self.nbSequences = 0
        for sequence in self.getIterator():
            self.sizes[sequence.name] = sequence.getSize()
            self.nbSequences += 1
            if self.verbosity >= 10 and self.nbSequences % 100000 == 0:
                sys.stdout.write("    %d sequences read\r" % (self.nbSequences))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            print("    %d sequences read" % (self.nbSequences))
            print("Done.")
        return self.sizes.keys()

    def getSizeOfRegion(self, region):
        """
        Get the size of a sequence
        @param region: the name of the sequence
        @type  region: string
        @return: the size of the sequence
        @raise Exception: if no sequence has this name
        """
        if self.sizes is None:
            # First call: read the sizes.  The size is then returned below
            # (the previous implementation returned nothing on this path).
            self.getRegions()
        if region not in self.sizes:
            raise Exception("Region %s is not found" % region)
        return self.sizes[region]

    def __eq__(self, o):
        """
        Two parsers are equal if they read the same file name and have the
        same number of sequences
        """
        if o is None:
            return False
        return self.fileName == o.fileName and self.nbSequences == o.nbSequences
class ShrimpParser(MapperParser):
    """Parser which turns each line of a Shrimp output file into a mapping"""

    def __init__(self, fileName, verbosity = 0):
        super(ShrimpParser, self).__init__(fileName, verbosity)

    def __del__(self):
        super(ShrimpParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """
        @return: the list of the file formats handled by this parser
        """
        return ["shrimp"]

    def skipFirstLines(self):
        """
        Skip the first line of the file
        """
        self.handle.readline()
        self.currentLineNb += 1

    def parseLine(self, line):
        """
        Build a mapping from one Shrimp line
        @param line: the line to parse
        @type  line: string
        @return: the mapping (the program exits if the line or its edit
                 string cannot be understood)
        """
        fields = re.search(r"^\s*>([^\t]+)\t+(\S+)\s+([+-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s*$", line)
        if fields is None:
            sys.exit("\nLine %d '%s' does not have a Shrimp format" % (self.currentLineNb, line))

        mapping = Mapping()
        query = mapping.queryInterval
        target = mapping.targetInterval

        # Bounds may come in any order on both sequences.
        queryBounds = (int(fields.group(6)), int(fields.group(7)))
        query.setName(fields.group(1))
        query.setStart(min(queryBounds))
        query.setEnd(max(queryBounds))

        targetBounds = (int(fields.group(4)), int(fields.group(5)))
        target.setChromosome(fields.group(2))
        target.setStart(min(targetBounds))
        target.setEnd(max(targetBounds))

        mapping.setSize(int(fields.group(8)))
        mapping.setDirection(fields.group(3))

        # The edit string is consumed from the left, one token at a time.
        remaining = fields.group(10)
        nbMismatches = 0
        nbGaps = 0
        while remaining != "":
            # a number followed by something else: drop the number
            token = re.search(r"^(\d+)(\D.*)$", remaining)
            if token is not None:
                remaining = token.group(2)
                continue
            # a number alone: nothing is left
            if re.search(r"^(\d+)$", remaining) is not None:
                remaining = ""
                continue
            # an upper-case letter: one mismatch
            token = re.search(r"^([A-Z])(.*)$", remaining)
            if token is not None:
                nbMismatches += 1
                remaining = token.group(2)
                continue
            # a parenthesized word: one gap per character
            token = re.search(r"^\((\w+)\)(.*)$", remaining)
            if token is not None:
                nbGaps += len(token.group(1))
                remaining = token.group(2)
                continue
            # a dash: one gap
            token = re.search(r"^-(.*)$", remaining)
            if token is not None:
                nbGaps += 1
                remaining = token.group(1)
                continue
            sys.exit("Cannot understand edit string %s from line %s" % (remaining, line))

        mapping.setNbMismatches(nbMismatches)
        mapping.setNbGaps(nbGaps)

        return mapping
def mappingToSubMapping(mapping):
    """Build a SubMapping carrying the same data as the given Mapping.

    Intervals are copied with ``copy``; the direction is set through the
    setter; ``size`` and ``tags`` are assigned directly, so the tag container
    is shared with the source mapping rather than duplicated.
    """
    subMapping = SubMapping()
    subMapping.targetInterval.copy(mapping.targetInterval)
    subMapping.queryInterval.copy(mapping.queryInterval)
    subMapping.setDirection(mapping.getDirection())
    subMapping.size = mapping.size
    subMapping.tags = mapping.tags
    return subMapping



class Soap2Parser(MapperParser):
    """A class that parses the output of SOAP2"""

    def __init__(self, fileName, verbosity = 0):
        super(Soap2Parser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(Soap2Parser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["soap2"]


    def skipFirstLines(self):
        """Nothing is skipped at the top of the file."""
        pass


    def getIterator(self):
        """Yield the mappings of the file, merging the two ends of a pair.

        Consecutive lines sharing a read name are grouped.  Within a group,
        lines tagged ``end == "a"`` are queued; any other line is merged with
        the oldest queued mapping into a single Mapping made of two
        sub-mappings.  Mappings still queued when the read name changes, or
        when the file ends, are yielded unchanged.
        """
        self.reset()
        currentName = None
        currentMappings = []
        for line in self.handle:
            mapping = self.parseLine(line)
            name = mapping.queryInterval.name
            if name == currentName:
                if mapping.getTagValue("end") == "a":
                    currentMappings.append(mapping)
                else:
                    otherEndMapping = currentMappings.pop(0)

                    newMapping = Mapping()
                    subMappingA = mappingToSubMapping(otherEndMapping)
                    subMappingB = mappingToSubMapping(mapping)
                    # Both sub-mappings report the direction of the first end.
                    subMappingB.queryInterval.setDirection(subMappingA.queryInterval.getDirection())

                    newMapping.addSubMapping(subMappingA)
                    newMapping.addSubMapping(subMappingB)

                    # The merged mapping reuses the tag container of the first
                    # end; the values below are then overwritten in place.
                    newMapping.tags = otherEndMapping.tags
                    newMapping.setSize(otherEndMapping.size + mapping.size)
                    newMapping.setNbMismatches(otherEndMapping.getTagValue("nbMismatches") + mapping.getTagValue("nbMismatches"))
                    # Debugging prints followed by an unconditional sys.exit()
                    # used to sit here and stopped the program on the first
                    # pair; they have been removed.
                    newMapping.setTagValue("qualityString", otherEndMapping.getTagValue("qualityString") + mapping.getTagValue("qualityString"))
                    newMapping.setTagValue("occurrence", "%d" % (newMapping.getTagValue("nbOccurrences") - len(currentMappings)))
                    newMapping.setTagValue("ID", "%s-%s" % (name, newMapping.getTagValue("occurrence")))
                    del newMapping.tags["end"]
                    yield newMapping
            else:
                currentName = name
                for currentMapping in currentMappings:
                    yield currentMapping
                currentMappings = [mapping]
            self.currentLineNb += 1
        # Flush what is still queued for the last read name; without this the
        # final group of the file was silently dropped.
        for currentMapping in currentMappings:
            yield currentMapping


    def parseLine(self, line):
        """Convert one SOAP2 line into a Mapping.

        The read sequence (second column) is matched but not stored.  The
        quality string, the number of occurrences and the pair end ("a" or
        "b") are stored as tags.  The process exits through ``sys.exit`` when
        the line does not match the expected layout.
        """
        m = re.search(r"^\s*(\S+)\s+(\w+)\s+(\S+)\s+(\d+)\s+([ab])\s+(\d+)\s+([+-])\s+(\w+)\s+(\d+)\s+(\d+)\s+", line)
        if m is None:
            sys.exit("\nLine %d '%s' does not have a SOAP2 format" % (self.currentLineNb, line))

        name = m.group(1)
        qualityString = m.group(3)
        nbOccurrences = int(m.group(4))
        end = m.group(5)
        size = int(m.group(6))
        direction = m.group(7)
        chromosome = m.group(8)
        genomeStart = int(m.group(9))
        nbMismatches = int(m.group(10))

        mapping = Mapping()
        # Drop the pair suffix so that both ends share the same read name.
        if name.endswith("/1") or name.endswith("/2"):
            name = name[:-2]

        mapping.queryInterval.name = name
        mapping.queryInterval.setDirection(direction)
        mapping.queryInterval.setStart(1)
        mapping.queryInterval.setEnd(size)

        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(genomeStart)
        mapping.targetInterval.setSize(size)

        mapping.setDirection(direction)
        mapping.setSize(size)

        mapping.setNbMismatches(nbMismatches)
        mapping.setNbGaps(0)
        mapping.setTagValue("qualityString", qualityString)
        mapping.setTagValue("nbOccurrences", nbOccurrences)
        mapping.setTagValue("end", end)

        return mapping
class SoapParser(MapperParser):
    """Read a SOAP output file and turn each line into a Mapping."""

    def __init__(self, fileName, verbosity = 0):
        super(SoapParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(SoapParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["soap"]


    def skipFirstLines(self):
        """Nothing is skipped at the top of the file."""
        pass


    def parseLine(self, line):
        """Convert one SOAP line into a Mapping.

        The size of the mapping is the length of the second column.  Only
        lines whose fifth column is "a" match.  The process exits through
        ``sys.exit`` when the line does not match the expected layout.
        """
        columns = re.search(r"^\s*(\S+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(a)\s+(\d+)\s+([+-])\s+(\w+)\s+(\d+)\s+(\d+)", line)
        if columns is None:
            sys.exit("\nLine %d '%s' does not have a SOAP format" % (self.currentLineNb, line))

        readLength = len(columns.group(2))
        mapping = Mapping()

        query = mapping.queryInterval
        query.setName(columns.group(1))
        query.setStart(1)
        query.setSize(readLength)

        target = mapping.targetInterval
        target.setChromosome(columns.group(8))
        target.setStart(int(columns.group(9)))
        target.setSize(readLength)

        mapping.setDirection(columns.group(7))
        mapping.setSize(readLength)
        mapping.setNbMismatches(int(columns.group(10)))

        return mapping
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import sys + +## this class can parse an SSR results output file. SSR.pl was developed by S.Cartinhour. 
class SsrParser(object):
    """Hold the eight fields of one line of an SSR results file.

    The results file is the output of SSR.pl (S. Cartinhour, 5/2000).  Values
    are stored exactly as they are given; when they come from
    ``setAttributesFromString`` they are strings.
    """

    # (label used in warning messages, name of the setter), in column order.
    # Setters are looked up by name so that overriding one in a subclass is
    # honoured by setAttributes().
    _FIELDS = (
        ("BES Name", "setBesName"),
        ("BES Redundancy", "setBesRedundancy"),
        ("SSR Number Nucleotides", "setSsrNbNucleotides"),
        ("SSR Motif", "setSsrMotif"),
        ("SSR Motif Number", "setSsrMotifNumber"),
        ("SSR Start", "setSsrStart"),
        ("SSR End", "setSsrEnd"),
        ("BES Size", "setBesSize"),
    )

    def __init__(self, BES_name='', BES_redundancy='', SSR_nbNucleotides='', SSR_Motif='', SSR_Motif_number='', SSR_start='', SSR_end='', BES_size=''):
        self._BesName = BES_name
        self._BesRedundancy = BES_redundancy
        self._SsrNbNucleotides = SSR_nbNucleotides
        self._SsrMotif = SSR_Motif
        self._SsrMotifNumber = SSR_Motif_number
        self._SsrStart = SSR_start
        self._SsrEnd = SSR_end
        self._BesSize = BES_size

    def __eq__(self, o):
        """Return True when all eight fields are equal.

        An object that does not carry the eight fields (``None`` included)
        compares unequal instead of raising AttributeError.
        """
        try:
            return self._BesName == o._BesName and self._BesRedundancy == o._BesRedundancy and self._SsrNbNucleotides == o._SsrNbNucleotides and self._SsrMotif == o._SsrMotif and self._SsrMotifNumber == o._SsrMotifNumber and self._SsrStart == o._SsrStart and self._SsrEnd == o._SsrEnd and self._BesSize == o._BesSize
        except AttributeError:
            return False

    def __ne__(self, o):
        """Negation of ``__eq__`` (Python 2 does not derive it automatically)."""
        return not self.__eq__(o)

    def setBesName(self, BES_Name):
        self._BesName = BES_Name

    def setBesRedundancy(self, BES_redundancy):
        self._BesRedundancy = BES_redundancy

    def setSsrNbNucleotides(self, SSR_nbNucleotides):
        self._SsrNbNucleotides = SSR_nbNucleotides

    def setSsrMotif(self, SSR_Motif):
        self._SsrMotif = SSR_Motif

    def setSsrMotifNumber(self, SSR_Motif_number):
        self._SsrMotifNumber = SSR_Motif_number

    def setSsrStart(self, SSR_start):
        self._SsrStart = SSR_start

    def setSsrEnd(self, SSR_end):
        self._SsrEnd = SSR_end

    def setBesSize(self, BES_size):
        self._BesSize = BES_size

    def getBesName(self):
        return self._BesName

    def getBesRedundancy(self):
        return self._BesRedundancy

    def getSsrNbNucleotides(self):
        return self._SsrNbNucleotides

    def getSsrMotif(self):
        return self._SsrMotif

    def getSsrMotifNumber(self):
        return self._SsrMotifNumber

    def getSsrStart(self):
        return self._SsrStart

    def getSsrEnd(self):
        return self._SsrEnd

    def getBesSize(self):
        return self._BesSize

    def setAttributes(self, lResults, iCurrentLineNumber):
        """Fill the eight fields from the first eight items of ``lResults``.

        Every empty item triggers a warning on stderr.  If at least one item
        is empty, all fields are reset to '' once every item has been looked
        at, so the object never ends up half filled.

        @param lResults sequence of at least eight values, in column order
        @param iCurrentLineNumber line number quoted in the warnings
        """
        error = False
        for index, (label, setterName) in enumerate(self._FIELDS):
            value = lResults[index]
            if value != '':
                getattr(self, setterName)(value)
            else:
                sys.stderr.write("WARNING: The field %s is empty in SSR results file in line %s\n" % (label, iCurrentLineNumber))
                error = True

        if error:
            self._setAllToNull()

    def setAttributesFromString(self, ssrLine, iCurrentLineNumber ="", fieldSeparator ="\t"):
        """Split a raw line and fill the fields from it.

        Trailing whitespace is removed before splitting.  A line with fewer
        than eight items only triggers a warning on stderr and leaves the
        object untouched.
        """
        lSsrLineItem = ssrLine.rstrip().split(fieldSeparator)
        if len(lSsrLineItem) < 8:
            sys.stderr.write("WARNING: The line %s is not a valid SSR Result line\n" % iCurrentLineNumber)
            return
        self.setAttributes(lSsrLineItem, iCurrentLineNumber)

    def _setAllToNull(self):
        """Reset the eight fields to the empty string, bypassing the setters."""
        self._BesName = ''
        self._BesRedundancy = ''
        self._SsrNbNucleotides = ''
        self._SsrMotif = ''
        self._SsrMotifNumber = ''
        self._SsrStart = ''
        self._SsrEnd = ''
        self._BesSize = ''
class TranscriptListParser(object):
    """A (quite generic) class that reads a list of transcripts.

    This class drives the reading; it calls ``self.skipFirstLines()`` and
    ``self.parseLine(line)``, neither of which is defined here, so they have
    to be supplied by subclasses.  ``parseLine`` may return None for a line
    that does not complete a transcript.
    """

    def __init__(self, fileName, verbosity = 0):
        """Open ``fileName`` and skip its first lines.

        Raises a plain ``Exception`` when the file cannot be opened.
        """
        self.verbosity = verbosity
        self.fileName = fileName
        self.nbTranscripts = None
        self.size = None
        self.chromosomes = None
        self.currentTranscript = None
        self.currentLineNb = 0
        self.previousTranscriptAddress = None
        # Defined up front so that the getters and close()/__del__ work even
        # before the first reset() or when open() fails.
        self.currentTranscriptAddress = None
        self.currentAddress = None
        self.handle = None
        try:
            self.handle = open(self.fileName)
        except IOError:
            raise Exception("Error! Transcript file '%s' does not exist! Exiting..." % (self.fileName))
        self.skipFirstLines()


    def __del__(self):
        self.close()


    @staticmethod
    def getFileFormats():
        """Placeholder returning None; nothing is declared at this level."""
        pass


    def close(self):
        """Close the underlying file, if any, and forget the handle.

        Calling it several times is harmless.
        """
        handle = getattr(self, "handle", None)
        # 'closed' is the state flag; the former test on the bound method
        # 'close' was always true, so the file was never closed.
        if handle is not None and not handle.closed:
            handle.close()
        self.handle = None


    def reset(self):
        """Rewind the file and put the parser back in its initial state."""
        self.handle.seek(0)
        # The counter is cleared before skipping the first lines, as in
        # __init__, so that lines counted by skipFirstLines() are kept.
        self.currentLineNb = 0
        self.skipFirstLines()
        self.currentTranscript = None
        self.currentTranscriptAddress = self.handle.tell()
        self.currentAddress = self.handle.tell()


    def gotoAddress(self, address):
        """Reset the parser, then move the file position to ``address``."""
        self.reset()
        self.handle.seek(address)
        self.currentTranscriptAddress = address
        self.currentAddress = address


    def parse(self):
        """Read the rest of the file and return it as a TranscriptList."""
        transcriptList = TranscriptList()
        progress = Progress(self.getNbTranscripts(), "Reading %s" % (self.fileName), self.verbosity)
        for line in self.handle:
            self.currentLineNb += 1
            transcript = self.parseLine(line)
            transcriptList.addTranscript(transcript)
            progress.inc()
        progress.done()
        return transcriptList


    def getIterator(self):
        """Yield every transcript of the file, starting from the beginning."""
        self.reset()
        transcript = self.getNextTranscript()
        while transcript is not None:
            yield transcript
            transcript = self.getNextTranscript()


    def getCurrentAddress(self):
        return self.currentAddress


    def getCurrentTranscriptAddress(self):
        return self.currentTranscriptAddress


    def getNextTranscript(self):
        """Return the next transcript, or None when the file is exhausted.

        Lines are read one at a time until ``parseLine`` returns something
        other than None.  At end of file, the transcript still held in
        ``self.currentTranscript`` (possibly None) is returned and cleared.
        """
        self.currentAddress = self.handle.tell()
        line = self.handle.readline()
        while line != "":
            line = line.strip()
            self.currentLineNb += 1
            transcript = self.parseLine(line)
            if transcript is not None:
                return transcript
            self.currentAddress = self.handle.tell()
            line = self.handle.readline()
        transcript = self.currentTranscript
        self.currentTranscriptAddress = self.previousTranscriptAddress
        self.currentTranscript = None
        return transcript


    def getInfos(self):
        """Scan the whole file once to collect chromosomes, count and size."""
        self.chromosomes = set()
        self.nbTranscripts = 0
        self.size = 0
        progress = UnlimitedProgress(100000, "Getting information on %s." % (self.fileName), self.verbosity-9)
        # getIterator() resets the parser itself, so no reset or read-ahead is
        # needed before the loop.
        for transcript in self.getIterator():
            self.chromosomes.add(transcript.getChromosome())
            self.nbTranscripts += 1
            self.size += transcript.getSize()
            progress.inc()
        progress.done()
        self.reset()


    def getNbTranscripts(self):
        """Return the number of transcripts, scanning the file if needed."""
        if self.nbTranscripts is None:
            self.getInfos()
        return self.nbTranscripts


    def getNbItems(self):
        return self.getNbTranscripts()


    def getChromosomes(self):
        """Return the set of chromosomes, scanning the file if needed."""
        if self.chromosomes is None:
            self.getInfos()
        return self.chromosomes


    def getSize(self):
        """Return the summed transcript size, scanning the file if needed."""
        if self.size is None:
            self.getInfos()
        return self.size


    def getNbNucleotides(self):
        return self.getSize()


    def setDefaultTagValue(self, name, value):
        """Call ``setTag(name, value)`` on every transcript of the file."""
        for transcript in self.getIterator():
            transcript.setTag(name, value)

    def __eq__(self, o):
        """Compare file name, transcript count, size and chromosomes."""
        if o is None:
            return False
        return self.fileName == o.fileName and self.nbTranscripts == o.nbTranscripts and self.size == o.size and self.chromosomes == o.chromosomes
Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,145 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class VarscanFile(object):
    """Read a Varscan result file and keep its lines as hit objects.

    The flavour of the file is deduced from the number of tab-separated
    columns of each data line: 12, 13, 19 or 20.
    """

    # Number of columns of a data line -> name of the file type.
    _TYPE_BY_NB_COLUMNS = {
        12: "Varscan_2_2",
        13: "Varscan_2_2_WithTag",
        19: "Varscan_2_2_8",
        20: "Varscan_2_2_8_WithTag",
    }

    # Name of the file type -> name of the method building the matching hit.
    # Methods are looked up by name on the instance so that a subclass
    # overriding one of them is honoured.
    _FACTORY_BY_TYPE = {
        "Varscan_2_2": "createVarscanHit",
        "Varscan_2_2_WithTag": "createVarscanHitWithTag",
        "Varscan_2_2_8": "createVarscanHit_v2_2_8",
        "Varscan_2_2_8_WithTag": "createVarscanHit_v2_2_8_WithTag",
    }

    def __init__(self, varscanFileName = ""):
        self._varscanFileName = varscanFileName
        self._varscanFieldSeparator = "\t"
        self._lVarscanHits = []
        self._typeOfVarscanFile = ""
        # Defined here so that getHeaderVarcanFile() also works on a file
        # without header line.
        self._headerVarcanFile = ""

    def __eq__(self, o):
        """Compare field separator, list of hits and file name.

        An object lacking these attributes (``None`` included) compares
        unequal instead of raising AttributeError.
        """
        try:
            return self._varscanFieldSeparator == o._varscanFieldSeparator and self._lVarscanHits == o._lVarscanHits and self._varscanFileName == o._varscanFileName
        except AttributeError:
            return False

    def setVarscanHitsList(self, lVarscanHits):
        self._lVarscanHits = lVarscanHits

    def setHeaderVarcanFile(self, headerVarcanFile):
        self._headerVarcanFile = headerVarcanFile

    def setTypeOfVarscanFile(self, type):
        """Store the file type; any unknown name is stored as ''."""
        if type in self._FACTORY_BY_TYPE:
            self._typeOfVarscanFile = type
        else:
            self._typeOfVarscanFile = ""

    def getVarscanHitsList(self):
        return self._lVarscanHits

    def getHeaderVarcanFile(self):
        return self._headerVarcanFile

    def getListOfVarscanHits(self):
        return self._lVarscanHits

    def getTypeOfVarscanFile(self):
        return self._typeOfVarscanFile

    def parse(self):
        """Read the file and append one hit per data line to the hit list.

        A first line containing "Chrom\\tPosition" is stored as the header.
        Any later line containing it is skipped and not counted.  The type of
        the file is updated after each data line.  Raises CheckerException
        on a line whose number of columns is not 12, 13, 19 or 20; the file
        is closed in every case.
        """
        with open(self._varscanFileName, "r") as varscanFile:
            currentLineNumber = 0
            line = varscanFile.readline()
            if "Chrom\tPosition" in line:
                self.setHeaderVarcanFile(line)
                line = varscanFile.readline()
            while line != "":
                if "Chrom\tPosition" not in line:
                    currentLineNumber += 1
                    line = line.strip()
                    nbColumns = len(line.split(self._varscanFieldSeparator))
                    typeOfLine = self._TYPE_BY_NB_COLUMNS.get(nbColumns)
                    if typeOfLine is None:
                        raise CheckerException ("Warning: this line (l.%s) is not a valid varscan line !" % currentLineNumber)
                    factory = getattr(self, self._FACTORY_BY_TYPE[typeOfLine])
                    currentVarscanLine = factory(line, currentLineNumber)
                    self._typeOfVarscanFile = typeOfLine
                    self._lVarscanHits.append(currentVarscanLine)
                line = varscanFile.readline()

    def createVarscanObjectFromLine(self, line, currentLineNumber):
        """Build a hit of the current file type from ``line``.

        Returns None when the file type is not one of the four known names.
        """
        factoryName = self._FACTORY_BY_TYPE.get(self._typeOfVarscanFile)
        if factoryName is None:
            return None
        return getattr(self, factoryName)(line, currentLineNumber)

    def createVarscanHit(self, line, currentLineNumber):
        iVarscanHit = VarscanHit()
        iVarscanHit.setAttributesFromString(line, currentLineNumber)
        return iVarscanHit

    def createVarscanHitWithTag(self, line, currentLineNumber):
        iVarscanHitWithTag = VarscanHit_WithTag()
        iVarscanHitWithTag.setAttributesFromString(line, currentLineNumber)
        return iVarscanHitWithTag

    def createVarscanHit_v2_2_8(self, line, currentLineNumber):
        iVarscanHit = VarscanHit_v2_2_8()
        iVarscanHit.setAttributesFromString(line, currentLineNumber)
        return iVarscanHit

    def createVarscanHit_v2_2_8_WithTag(self, line, currentLineNumber):
        iVarscanHitWithTag = VarscanHit_v2_2_8_WithTag()
        iVarscanHitWithTag.setAttributesFromString(line, currentLineNumber)
        return iVarscanHitWithTag

    def selectTypeOfVarscanHitObject(self):
        """Return an empty hit object matching the current file type.

        Raises CheckerException when no type has been set yet.
        """
        if self._typeOfVarscanFile == "":
            raise CheckerException ("Error: no varscan object found !")
        elif self._typeOfVarscanFile == "Varscan_2_2":
            return VarscanHit()
        elif self._typeOfVarscanFile == "Varscan_2_2_WithTag":
            return VarscanHit_WithTag()
        elif self._typeOfVarscanFile == "Varscan_2_2_8":
            return VarscanHit_v2_2_8()
        elif self._typeOfVarscanFile == "Varscan_2_2_8_WithTag":
            return VarscanHit_v2_2_8_WithTag()
class VarscanFileForGnpSNP(VarscanFile):
    """Varscan file whose 12-column lines are read as VarscanHitForGnpSNP."""

    def __init__(self, varscanFileName, fastqFileName="", refFastaFileName="", taxonName=""):
        VarscanFile.__init__(self, varscanFileName)
        self._previousVarscanHit = None
        self._taxonName = taxonName
        self._refFastaFileName = refFastaFileName
        self._fastqFileName = fastqFileName

    ## Equal operator
    #
    # @param o the other VarscanFileForGnpSNP instance
    #
    def __eq__(self, o):
        if not VarscanFile.__eq__(self, o):
            return False
        sameFastq = self._fastqFileName == o._fastqFileName
        sameReference = self._refFastaFileName == o._refFastaFileName
        return sameFastq and sameReference and self._taxonName == o._taxonName

    def getVarscanFieldSeparator(self):
        return self._varscanFieldSeparator

    def getFastqFileName(self):
        return self._fastqFileName

    def getRefFastaFileName(self):
        return self._refFastaFileName

    def getTaxonName(self):
        return self._taxonName

    ## Build the hit of one line and link it to the previous one
    #
    # @param line the raw line of the varscan file
    # @param currentLineNumber the line number, passed on to the hit
    # @return the new VarscanHitForGnpSNP, also remembered as previous hit
    #
    def createVarscanHit(self, line, currentLineNumber):
        fields = line.strip().split(self._varscanFieldSeparator)
        hit = VarscanHitForGnpSNP()
        hit.setAttributes(fields, currentLineNumber)
        hit.formatAlleles2GnpSnp()
        hit.manageOccurrence(self._previousVarscanHit)
        self._previousVarscanHit = hit
        return hit
+1,175 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class VarscanHit(object):
    """Hold the twelve fields of one line of a Varscan result file.

    Values are stored exactly as they are given; when they come from
    ``setAttributesFromString`` they are strings.
    """

    # Mandatory columns: (label used in error messages, name of the setter).
    _REQUIRED_FIELDS = (
        ("Chrom", "setChrom"),
        ("Position", "setPosition"),
        ("Ref", "setRef"),
        ("Var", "setVar"),
    )

    # Setters of the optional columns, in column order after the mandatory
    # ones.  Setters are looked up by name so that overriding one in a
    # subclass is honoured by setAttributes().
    _OPTIONAL_SETTERS = (
        "setReadsRef",
        "setReadsVar",
        "setVarFreq",
        "setStrandsRef",
        "setStrandsVar",
        "setQualRef",
        "setQualVar",
        "setPValue",
    )

    def __init__(self, chrom = "", position = "", ref = "", var = "", readsRef = "", readsVar = "", varFreq = "", strandsRef = "", strandsVar = "", qualRef = "", qualVar = "", pValue = ""):
        self._chrom = chrom
        self._position = position
        self._ref = ref
        self._var = var
        self._readsRef = readsRef
        self._readsVar = readsVar
        self._varFreq = varFreq
        self._strandsRef = strandsRef
        self._strandsVar = strandsVar
        self._qualRef = qualRef
        self._qualVar = qualVar
        self._pValue = pValue

    ## Equal operator
    #
    # Only chromosome, position, reference and variant are compared.  An
    # object lacking these attributes (None included) compares unequal
    # instead of raising AttributeError.
    #
    # @param o the other VarscanHit instance
    #
    def __eq__(self, o):
        try:
            return self._chrom == o._chrom and self._position == o._position and self._ref == o._ref and self._var == o._var
        except AttributeError:
            return False

    ## Not-equal operator, negation of the equal operator
    #
    # Python 2 does not derive it automatically.
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    def setChrom(self, chromosome):
        self._chrom = chromosome

    def setPosition(self, position):
        self._position = position

    def setRef(self, referenceAllele):
        self._ref = referenceAllele

    def setVar(self, variantAllele):
        self._var = variantAllele

    def setReadsRef(self, readsRef):
        self._readsRef = readsRef

    def setReadsVar(self, readsVar):
        self._readsVar = readsVar

    def setVarFreq(self, varFreq):
        self._varFreq = varFreq

    def setStrandsRef(self, strandsRef):
        self._strandsRef = strandsRef

    def setStrandsVar(self, strandsVar):
        self._strandsVar = strandsVar

    def setQualRef(self, qualRef):
        self._qualRef = qualRef

    def setQualVar(self, qualVar):
        self._qualVar = qualVar

    def setPValue(self, pValue):
        self._pValue = pValue

    def getChrom(self):
        return self._chrom

    def getPosition(self):
        return self._position

    def getRef(self):
        return self._ref

    def getVar(self):
        return self._var

    def getReadsRef(self):
        return self._readsRef

    def getReadsVar(self):
        return self._readsVar

    def getVarFreq(self):
        return self._varFreq

    def getStrandsRef(self):
        return self._strandsRef

    def getStrandsVar(self):
        return self._strandsVar

    def getQualRef(self):
        return self._qualRef

    def getQualVar(self):
        return self._qualVar

    def getPValue(self):
        return self._pValue

    def getHeader(self):
        return "Chrom\tPosition\tRef\tVar\tReads1\tReads2\tVarFreq\tStrands1\tStrands2\tQual1\tQual2\tPvalue\n"

    def getVarscanLine(self):
        """Return the twelve fields as one tab-separated, newline-ended line."""
        return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (self.getChrom(), self.getPosition(), self.getRef(), self.getVar(), self.getReadsRef(), self.getReadsVar(), self.getVarFreq(), self.getStrandsRef(), self.getStrandsVar(), self.getQualRef(), self.getQualVar(), self.getPValue())

    def setAttributes(self, lResults, iCurrentLineNumber):
        """Fill the fields from the first twelve items of ``lResults``.

        The first four items are mandatory: an empty one raises
        CheckerException, the items before it having already been stored.
        The eight others are stored only when they are not empty.

        @param lResults sequence of at least twelve values, in column order
        @param iCurrentLineNumber line number quoted in the error messages
        """
        for index, (label, setterName) in enumerate(self._REQUIRED_FIELDS):
            if lResults[index] == '':
                raise CheckerException ("The field %s is empty in varscan file in line %s" % (label, iCurrentLineNumber))
            getattr(self, setterName)(lResults[index])
        firstOptional = len(self._REQUIRED_FIELDS)
        for index, setterName in enumerate(self._OPTIONAL_SETTERS, firstOptional):
            if lResults[index] != '':
                getattr(self, setterName)(lResults[index])

    def setAttributesFromString(self, varscanString, iCurrentLineNumber ="", fieldSeparator ="\t"):
        """Split a raw line and fill the fields from it.

        Trailing whitespace is removed before splitting, and the list of
        items is padded with empty strings up to twelve entries.
        """
        lvarscanStringItem = varscanString.rstrip().split(fieldSeparator)
        nbMissing = 12 - len(lvarscanStringItem)
        if nbMissing > 0:
            lvarscanStringItem.extend([""] * nbMissing)
        self.setAttributes(lvarscanStringItem, iCurrentLineNumber)
## Varscan hit extended with the information needed for GnpSNP.
#
# On top of the VarscanHit columns it stores flanking sequences, the
# GnpSNP-formatted alleles and position, the polymorphism type and length,
# and an occurrence counter.
#
class VarscanHitForGnpSNP(VarscanHit):

    ## Constructor: every field starts empty, the occurrence starts at 1
    #
    def __init__(self):
        VarscanHit.__init__(self)
        self._reads1 = ''
        self._reads2 = ''
        self._varFreq = ''
        self._strands1 = ''
        self._strands2 = ''
        self._qual1 = ''
        self._qual2 = ''
        self._pvalue = ''
        self._5flank = ''
        self._3flank = ''
        self._gnpSnp_ref = ''
        self._gnpSnp_var = ''
        self._gnpSnp_position = 0
        self._polymType = ''
        self._polymLength = 0
        self._occurrence = 1

    ## Equal operator
    #
    # Hits are equal when the VarscanHit comparison succeeds and all the
    # attributes added by this class are equal.
    #
    # @param o the object to compare with
    # @return False when o does not carry the compared attributes (e.g. None)
    #
    def __eq__(self, o):
        lComparedAttributes = ("_reads1", "_reads2", "_varFreq", "_strands1",
                               "_strands2", "_qual1", "_qual2", "_pvalue",
                               "_3flank", "_5flank", "_gnpSnp_position",
                               "_gnpSnp_ref", "_gnpSnp_var", "_polymLength",
                               "_polymType", "_occurrence")
        try:
            if not VarscanHit.__eq__(self, o):
                return False
            for attributeName in lComparedAttributes:
                if getattr(self, attributeName) != getattr(o, attributeName):
                    return False
        except AttributeError:
            return False
        return True

    ## Not-equal operator
    #
    # Python 2 does not derive '!=' from __eq__, so it is defined explicitly.
    #
    # @param o the object to compare with
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    ## Tell whether another hit has the same chromosome, position and polymorphism type
    #
    # @param iVarscanHitForGnpSNP the hit to compare with
    #
    def isPolymTypeAlreadyFoundAtThisChromAndThisPosition(self, iVarscanHitForGnpSNP):
        if self._chrom != iVarscanHitForGnpSNP.getChrom():
            return False
        if self._position != iVarscanHitForGnpSNP.getPosition():
            return False
        return self._polymType == iVarscanHitForGnpSNP.getPolymType()

    ## Set the occurrence to one more than the given hit's when both describe the same polymorphism
    #
    # Nothing is changed when no hit is given or when the given hit differs
    # in chromosome, position or polymorphism type.
    #
    # @param iVarscanHitForGnpSNP a previously seen hit, or None
    #
    def manageOccurrence(self, iVarscanHitForGnpSNP=None):
        # Identity test: '!= None' would go through __eq__ on Python 3.
        if iVarscanHitForGnpSNP is None:
            return
        if self.isPolymTypeAlreadyFoundAtThisChromAndThisPosition(iVarscanHitForGnpSNP):
            self._occurrence = iVarscanHitForGnpSNP.getOccurrence() + 1

    ## Compute polymorphism type, GnpSNP position, alleles and length from the Varscan alleles
    #
    # A variant allele containing "-" is a DELETION, one containing "+" is an
    # INSERTION, anything else is a SNP. For indels the first character of
    # the variant allele is dropped and the missing allele is written as
    # dashes of the same length.
    #
    def formatAlleles2GnpSnp(self):
        sVariantAllele = self.getVar()
        if "-" in sVariantAllele:
            self._polymType = "DELETION"
            # Deletions are reported one base after the Varscan position.
            self._gnpSnp_position = int(self._position) + 1
            self._gnpSnp_ref = self._var[1:]
            self._gnpSnp_var = "-" * len(self._gnpSnp_ref)
            self._polymLength = len(self._gnpSnp_ref)
        elif "+" in sVariantAllele:
            self._polymType = "INSERTION"
            self._gnpSnp_position = int(self._position)
            self._gnpSnp_var = self._var[1:]
            self._gnpSnp_ref = "-" * len(self._gnpSnp_var)
            # NOTE(review): insertions get a fixed length of 1 while deletions
            # use the allele length -- confirm this is the intended convention.
            self._polymLength = 1
        else:
            self._polymType = "SNP"
            self._gnpSnp_position = int(self._position)
            self._gnpSnp_var = self._var
            self._gnpSnp_ref = self._ref
            self._polymLength = 1

    # --- setters ---

    def setReads1(self, nbReadsLikeRef):
        self._reads1 = nbReadsLikeRef

    def setReads2(self, nbReadsLikeVar):
        self._reads2 = nbReadsLikeVar

    ## Store the variant frequency as a float
    #
    # Any "%" is removed and "," is read as the decimal separator.
    #
    # @param frequencyOfVariantAllele frequency as a string, e.g. "12,5%"
    # @raise ValueError when the cleaned string is not a number
    #
    def setVarFreq(self, frequencyOfVariantAllele):
        sCleanedFrequency = frequencyOfVariantAllele.replace("%","").replace(",",".")
        self._varFreq = float(sCleanedFrequency)

    def setStrands1(self, strandsOfReferenceAllele):
        self._strands1 = strandsOfReferenceAllele

    def setStrands2(self, strandsOfVariantAllele):
        self._strands2 = strandsOfVariantAllele

    def setQual1(self, averageQualityOfRef):
        self._qual1 = averageQualityOfRef

    def setQual2(self, averageQualityOfVar):
        self._qual2 = averageQualityOfVar

    def setPvalue(self, pvalue):
        self._pvalue = pvalue

    def set5flank(self, s5flank):
        self._5flank = s5flank

    def set3flank(self, s3flank):
        self._3flank = s3flank

    def setGnpSNPRef(self, ref):
        self._gnpSnp_ref = ref

    def setGnpSNPVar(self, var):
        self._gnpSnp_var = var

    def setGnpSNPPosition(self, position):
        self._gnpSnp_position = position

    def setOccurrence(self, occurrence):
        self._occurrence = occurrence

    def setPolymType(self, polymType):
        self._polymType = polymType

    def setPolymLength(self, polymLength):
        self._polymLength = polymLength

    # --- getters ---

    def getReads1(self):
        return self._reads1

    def getReads2(self):
        return self._reads2

    def getVarFreq(self):
        return self._varFreq

    def getStrands1(self):
        return self._strands1

    def getStrands2(self):
        return self._strands2

    def getQual1(self):
        return self._qual1

    def getQual2(self):
        return self._qual2

    def getPvalue(self):
        return self._pvalue

    def get5flank(self):
        return self._5flank

    def get3flank(self):
        return self._3flank

    def getPolymType(self):
        return self._polymType

    def getGnpSnpVar(self):
        return self._gnpSnp_var

    def getGnpSnpRef(self):
        return self._gnpSnp_ref

    def getGnpSnpPosition(self):
        return self._gnpSnp_position

    def getPolymLength(self):
        return self._polymLength

    def getOccurrence(self):
        return self._occurrence

    ## Store one field, which must not be empty
    #
    # @param sValue the field value
    # @param sFieldName column name used in the error message
    # @param setter bound setter receiving the value
    # @param iCurrentLineNumber line number used in the error message
    # @raise CheckerException when the value is empty
    #
    def _setMandatoryField(self, sValue, sFieldName, setter, iCurrentLineNumber):
        if sValue != '':
            setter(sValue)
        else:
            raise CheckerException ("The field %s is empty in varscan file in line %s" % (sFieldName, iCurrentLineNumber))

    ## Fill the hit from a list of field values; all twelve columns are mandatory here
    #
    # @param lResults list of at least twelve field values, in column order
    # @param iCurrentLineNumber line number, only used in error messages
    # @raise CheckerException when a field is empty or VarFreq is malformed
    #
    def setAttributes(self, lResults, iCurrentLineNumber):
        sVarFreqError = "The field VarFreq is empty or in bad format in varscan file in line %s" % (iCurrentLineNumber)
        try:
            VarscanHit.setAttributes(self, lResults, iCurrentLineNumber)
        except CheckerException:
            raise
        except ValueError:
            # The parent calls the overridden setVarFreq(), whose float()
            # conversion fails on a malformed frequency.
            raise CheckerException (sVarFreqError)

        self._setMandatoryField(lResults[4], "Reads1", self.setReads1, iCurrentLineNumber)
        self._setMandatoryField(lResults[5], "Reads2", self.setReads2, iCurrentLineNumber)

        sVarFreq = lResults[6]
        if sVarFreq != '' and re.match(r"[0-9,%]+", sVarFreq):
            try:
                self.setVarFreq(sVarFreq)
            except ValueError:
                raise CheckerException (sVarFreqError)
        else:
            raise CheckerException (sVarFreqError)

        lRemainingFields = [(7, "Strands1", self.setStrands1), (8, "Strands2", self.setStrands2),
                            (9, "Qual1", self.setQual1), (10, "Qual2", self.setQual2),
                            (11, "Pvalue", self.setPvalue)]
        for index, sFieldName, setter in lRemainingFields:
            self._setMandatoryField(lResults[index], sFieldName, setter, iCurrentLineNumber)
## Varscan hit carrying an additional, thirteenth "Tag" column.
#
class VarscanHit_WithTag(VarscanHit):

    ## Constructor
    #
    # @param tag value of the Tag column
    #
    # The other arguments are forwarded unchanged to the VarscanHit
    # constructor.
    #
    def __init__(self, tag = "", chrom = "", position = "", ref = "", var = "", readsRef = "", readsVar = "", varFreq = "", strandsRef = "", strandsVar = "", qualRef = "", qualVar = "", pValue = ""):
        self._tag = tag
        VarscanHit.__init__(self, chrom, position, ref, var, readsRef, readsVar, varFreq, strandsRef, strandsVar, qualRef, qualVar, pValue)

    ## Equal operator
    #
    # Hits are equal when their tags are equal and the VarscanHit
    # comparison succeeds.
    #
    # @param o the object to compare with
    # @return False when o does not carry the compared attributes (e.g. None)
    #
    def __eq__(self, o):
        try:
            if self._tag == o._tag:
                return VarscanHit.__eq__(self, o)
        except AttributeError:
            # o has no tag (or no hit attributes at all): not equal.
            return False
        return False

    ## Not-equal operator
    #
    # Python 2 does not derive '!=' from __eq__, so it is defined explicitly.
    #
    # @param o the object to compare with
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    def setTag(self, tag):
        self._tag = tag

    def getTag(self):
        return self._tag

    ## Get the header line: the twelve Varscan column names followed by "Tag"
    #
    def getHeader(self):
        return "Chrom\tPosition\tRef\tVar\tReads1\tReads2\tVarFreq\tStrands1\tStrands2\tQual1\tQual2\tPvalue\tTag\n"

    ## Serialize the hit as a Varscan line ending with the tag
    #
    # @return the thirteen fields, tab-separated and newline-terminated
    #
    def getVarscanLine(self):
        lValues = (self.getChrom(), self.getPosition(), self.getRef(), self.getVar(),
                   self.getReadsRef(), self.getReadsVar(), self.getVarFreq(),
                   self.getStrandsRef(), self.getStrandsVar(),
                   self.getQualRef(), self.getQualVar(), self.getPValue(),
                   self.getTag())
        return ("\t".join(["%s"] * len(lValues)) + "\n") % lValues

    ## Fill the hit from a list of field values
    #
    # The first twelve fields are handled by VarscanHit.setAttributes(); the
    # thirteenth one, when present and not empty, is stored as the tag.
    #
    # @param lResults list of field values, in column order
    # @param iCurrentLineNumber line number, only used in error messages
    #
    def setAttributes(self, lResults, iCurrentLineNumber):
        iTagIndex = 12
        VarscanHit.setAttributes(self, lResults, iCurrentLineNumber)
        # The tag is optional, so a list without a thirteenth item is accepted.
        if len(lResults) > iTagIndex and lResults[iTagIndex] != '':
            self.setTag(lResults[iTagIndex])

    ## Fill the hit from one line of a tagged Varscan file
    #
    # Trailing whitespace is removed before splitting, then the field list
    # is padded with empty strings up to thirteen items.
    #
    # @param varscanString the line to parse
    # @param iCurrentLineNumber line number, only used in error messages
    # @param fieldSeparator separator between fields
    #
    def setAttributesFromString(self, varscanString, iCurrentLineNumber ="", fieldSeparator ="\t"):
        nbExpectedFields = 13
        lvarscanStringItem = varscanString.rstrip().split(fieldSeparator)
        nbMissingFields = nbExpectedFields - len(lvarscanStringItem)
        if nbMissingFields > 0:
            lvarscanStringItem.extend([""] * nbMissingFields)
        self.setAttributes(lvarscanStringItem, iCurrentLineNumber)
b/commons/core/parsing/VarscanHit_v2_2_8.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,176 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Varscan hit for the 19-column layout handled as "Varscan 2.2.8".
#
# Compared with VarscanHit, the fourth column is the consensus ("Cons"),
# six columns are added (MapQual1/2, Reads1Plus/Minus, Reads2Plus/Minus)
# and the variant allele ("VarAllele") is the last column.
#
class VarscanHit_v2_2_8(VarscanHit):

    ## Constructor
    #
    # Arguments follow the column order returned by getHeader().
    #
    # @param cns value of the Cons column
    # @param mapQualRef value of the MapQual1 column
    # @param mapQualVar value of the MapQual2 column
    # @param readsRefPlus value of the Reads1Plus column
    # @param readsRefMinus value of the Reads1Minus column
    # @param readsVarPlus value of the Reads2Plus column
    # @param readsVarMinus value of the Reads2Minus column
    # @param var value of the VarAllele column
    #
    # The other arguments are forwarded unchanged to the VarscanHit
    # constructor.
    #
    def __init__(self, chrom = "", position = "", ref = "", cns = "", readsRef = "", readsVar = "", varFreq = "", strandsRef = "", strandsVar = "", qualRef = "", qualVar = "", pValue = "", mapQualRef = "", mapQualVar = "", readsRefPlus = "", readsRefMinus = "", readsVarPlus = "", readsVarMinus = "", var = ""):
        self._cns = cns
        self._mapQualRef = mapQualRef
        self._mapQualVar = mapQualVar
        self._readsRefPlus = readsRefPlus
        self._readsRefMinus = readsRefMinus
        self._readsVarPlus = readsVarPlus
        self._readsVarMinus = readsVarMinus
        VarscanHit.__init__(self, chrom, position, ref, var, readsRef, readsVar, varFreq, strandsRef, strandsVar, qualRef, qualVar, pValue)

    ## Equal operator
    #
    # Hits are equal when their consensus values are equal and the
    # VarscanHit comparison succeeds.
    #
    # @param o the object to compare with
    # @return False when o does not carry the compared attributes (e.g. None)
    #
    def __eq__(self, o):
        try:
            if self._cns == o._cns:
                return VarscanHit.__eq__(self, o)
        except AttributeError:
            # o has no consensus (or no hit attributes at all): not equal.
            return False
        return False

    ## Not-equal operator
    #
    # Python 2 does not derive '!=' from __eq__, so it is defined explicitly.
    #
    # @param o the object to compare with
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    # --- setters ---

    def setCns(self, consensus):
        self._cns = consensus

    def setMapQualRef(self, mapQualRef):
        self._mapQualRef = mapQualRef

    def setMapQualVar(self, mapQualVar):
        self._mapQualVar = mapQualVar

    def setReadsRefPlus(self, readsRefPlus):
        self._readsRefPlus = readsRefPlus

    def setReadsRefMinus(self, readsRefMinus):
        self._readsRefMinus = readsRefMinus

    def setReadsVarPlus(self, readsVarPlus):
        self._readsVarPlus = readsVarPlus

    def setReadsVarMinus(self, readsVarMinus):
        self._readsVarMinus = readsVarMinus

    # --- getters ---

    def getCns(self):
        return self._cns

    def getMapQualRef(self):
        return self._mapQualRef

    def getMapQualVar(self):
        return self._mapQualVar

    def getReadsRefPlus(self):
        return self._readsRefPlus

    def getReadsRefMinus(self):
        return self._readsRefMinus

    def getReadsVarPlus(self):
        return self._readsVarPlus

    def getReadsVarMinus(self):
        return self._readsVarMinus

    ## Get the header line of the 19-column layout
    #
    def getHeader(self):
        return "Chrom\tPosition\tRef\tCons\tReads1\tReads2\tVarFreq\tStrands1\tStrands2\tQual1\tQual2\tPvalue\tMapQual1\tMapQual2\tReads1Plus\tReads1Minus\tReads2Plus\tReads2Minus\tVarAllele\n"

    ## Serialize the hit as a line of the 19-column layout
    #
    # @return the nineteen fields, tab-separated and newline-terminated
    #
    def getVarscanLine(self):
        lValues = (self.getChrom(), self.getPosition(), self.getRef(), self.getCns(),
                   self.getReadsRef(), self.getReadsVar(), self.getVarFreq(),
                   self.getStrandsRef(), self.getStrandsVar(),
                   self.getQualRef(), self.getQualVar(), self.getPValue(),
                   self.getMapQualRef(), self.getMapQualVar(),
                   self.getReadsRefPlus(), self.getReadsRefMinus(),
                   self.getReadsVarPlus(), self.getReadsVarMinus(),
                   self.getVar())
        return ("\t".join(["%s"] * len(lValues)) + "\n") % lValues

    ## Fill the hit from a list of field values
    #
    # Chrom, Position, Ref, Cons and varAllele are mandatory; every other
    # field is only stored when not empty. Fields missing at the end of the
    # list are handled like empty fields.
    #
    # @param lResults list of field values, in column order
    # @param iCurrentLineNumber line number, only used in error messages
    # @raise CheckerException when a mandatory field is empty
    #
    def setAttributes(self, lResults, iCurrentLineNumber):
        # One (name, setter) pair per column, in column order; the name is
        # None for optional columns.
        lFields = [("Chrom", self.setChrom), ("Position", self.setPosition),
                   ("Ref", self.setRef), ("Cons", self.setCns),
                   (None, self.setReadsRef), (None, self.setReadsVar),
                   (None, self.setVarFreq),
                   (None, self.setStrandsRef), (None, self.setStrandsVar),
                   (None, self.setQualRef), (None, self.setQualVar),
                   (None, self.setPValue),
                   (None, self.setMapQualRef), (None, self.setMapQualVar),
                   (None, self.setReadsRefPlus), (None, self.setReadsRefMinus),
                   (None, self.setReadsVarPlus), (None, self.setReadsVarMinus),
                   ("varAllele", self.setVar)]
        nbAvailable = len(lResults)
        for index, (sMandatoryName, setter) in enumerate(lFields):
            value = lResults[index] if index < nbAvailable else ''
            if value != '':
                setter(value)
            elif sMandatoryName is not None:
                raise CheckerException ("The field %s is empty in varscan file in line %s" % (sMandatoryName, iCurrentLineNumber))

    ## Fill the hit from one line of a 19-column Varscan file
    #
    # @param varscanString the line to parse
    # @param iCurrentLineNumber line number, only used in error messages
    # @param fieldSeparator separator between fields
    # @raise CheckerException when the line has fewer than 19 fields or a
    #        mandatory field is empty
    #
    def setAttributesFromString(self, varscanString, iCurrentLineNumber ="", fieldSeparator ="\t"):
        lvarscanStringItem = varscanString.rstrip().split(fieldSeparator)
        if len(lvarscanStringItem) < 19:
            raise CheckerException ("This varscan line (l.%s) is not complete" % iCurrentLineNumber)
        self.setAttributes(lvarscanStringItem, iCurrentLineNumber)

    ## Build a plain VarscanHit from the twelve columns both layouts share
    #
    # The consensus and the six additional columns are not copied.
    #
    # @return a new VarscanHit instance
    #
    def convertVarscanHit_v2_2_8_To_VarscanHit(self):
        return VarscanHit(chrom = self.getChrom(),
                          position = self.getPosition(),
                          ref = self.getRef(),
                          var = self.getVar(),
                          readsRef = self.getReadsRef(),
                          readsVar = self.getReadsVar(),
                          varFreq = self.getVarFreq(),
                          strandsRef = self.getStrandsRef(),
                          strandsVar = self.getStrandsVar(),
                          qualRef = self.getQualRef(),
                          qualVar = self.getQualVar(),
                          pValue = self.getPValue())
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## 19-column Varscan hit carrying an additional, twentieth "Tag" column.
#
class VarscanHit_v2_2_8_WithTag(VarscanHit_v2_2_8):

    ## Constructor
    #
    # @param tag value of the Tag column
    #
    # The other arguments are forwarded unchanged to the VarscanHit_v2_2_8
    # constructor.
    #
    def __init__(self, chrom = "", position = "", ref = "", cns = "", readsRef = "", readsVar = "", varFreq = "", strandsRef = "", strandsVar = "", qualRef = "", qualVar = "", pValue = "", mapQualRef = "", mapQualVar = "", readsRefPlus = "", readsRefMinus = "", readsVarPlus = "", readsVarMinus = "", var = "", tag = ""):
        self._tag = tag
        # The fourth argument of the parent constructor is the consensus:
        # 'cns' must be forwarded there, 'var' goes last.
        VarscanHit_v2_2_8.__init__(self, chrom, position, ref, cns, readsRef, readsVar, varFreq, strandsRef, strandsVar, qualRef, qualVar, pValue, mapQualRef, mapQualVar, readsRefPlus, readsRefMinus, readsVarPlus, readsVarMinus, var)

    ## Equal operator
    #
    # Hits are equal when their tags are equal and the VarscanHit_v2_2_8
    # comparison succeeds.
    #
    # @param o the object to compare with
    # @return False when o does not carry the compared attributes (e.g. None)
    #
    def __eq__(self, o):
        try:
            if self._tag == o._tag:
                return VarscanHit_v2_2_8.__eq__(self, o)
        except AttributeError:
            # o has no tag (or no hit attributes at all): not equal.
            return False
        return False

    ## Not-equal operator
    #
    # Python 2 does not derive '!=' from __eq__, so it is defined explicitly.
    #
    # @param o the object to compare with
    #
    def __ne__(self, o):
        return not self.__eq__(o)

    def setTag(self, tag):
        self._tag = tag

    def getTag(self):
        return self._tag

    ## Get the header line: the nineteen column names followed by "Tag"
    #
    def getHeader(self):
        return "Chrom\tPosition\tRef\tCons\tReads1\tReads2\tVarFreq\tStrands1\tStrands2\tQual1\tQual2\tPvalue\tMapQual1\tMapQual2\tReads1Plus\tReads1Minus\tReads2Plus\tReads2Minus\tVarAllele\tTag\n"

    ## Serialize the hit as a Varscan line ending with the tag
    #
    # @return the twenty fields, tab-separated and newline-terminated
    #
    def getVarscanLine(self):
        lValues = (self.getChrom(), self.getPosition(), self.getRef(), self.getCns(),
                   self.getReadsRef(), self.getReadsVar(), self.getVarFreq(),
                   self.getStrandsRef(), self.getStrandsVar(),
                   self.getQualRef(), self.getQualVar(), self.getPValue(),
                   self.getMapQualRef(), self.getMapQualVar(),
                   self.getReadsRefPlus(), self.getReadsRefMinus(),
                   self.getReadsVarPlus(), self.getReadsVarMinus(),
                   self.getVar(), self.getTag())
        return ("\t".join(["%s"] * len(lValues)) + "\n") % lValues

    ## Fill the hit from a list of field values
    #
    # The first nineteen fields are handled by
    # VarscanHit_v2_2_8.setAttributes(); the twentieth one is the tag and is
    # mandatory. A list without a twentieth item is handled like an empty
    # tag.
    #
    # @param lResults list of field values, in column order
    # @param iCurrentLineNumber line number, only used in error messages
    # @raise CheckerException when a mandatory field, tag included, is empty
    #
    def setAttributes(self, lResults, iCurrentLineNumber):
        iTagIndex = 19
        VarscanHit_v2_2_8.setAttributes(self, lResults, iCurrentLineNumber)
        sTag = lResults[iTagIndex] if len(lResults) > iTagIndex else ''
        if sTag != '':
            self.setTag(sTag)
        else:
            raise CheckerException ("The field tag is empty in varscan file in line %s" % iCurrentLineNumber)

    ## Fill the hit from one line of a tagged 19-column Varscan file
    #
    # @param varscanString the line to parse
    # @param iCurrentLineNumber line number, only used in error messages
    # @param fieldSeparator separator between fields
    # @raise CheckerException when the line has fewer than 20 fields or a
    #        mandatory field is empty
    #
    def setAttributesFromString(self, varscanString, iCurrentLineNumber ="", fieldSeparator ="\t"):
        lvarscanStringItem = varscanString.rstrip().split(fieldSeparator)
        if len(lvarscanStringItem) < 20:
            raise CheckerException ("This varscan line (l.%s) is not complete" % iCurrentLineNumber)
        self.setAttributes(lvarscanStringItem, iCurrentLineNumber)

    ## Build a VarscanHit_WithTag from the tag and the twelve shared columns
    #
    # The consensus and the six additional columns are not copied.
    #
    # @return a new VarscanHit_WithTag instance
    #
    def convertVarscanHit_v2_2_8_WithTag_To_VarscanHit_WithTag(self):
        return VarscanHit_WithTag(tag = self.getTag(),
                                  chrom = self.getChrom(),
                                  position = self.getPosition(),
                                  ref = self.getRef(),
                                  var = self.getVar(),
                                  readsRef = self.getReadsRef(),
                                  readsVar = self.getReadsVar(),
                                  varFreq = self.getVarFreq(),
                                  strandsRef = self.getStrandsRef(),
                                  strandsVar = self.getStrandsVar(),
                                  qualRef = self.getQualRef(),
                                  qualVar = self.getQualVar(),
                                  pValue = self.getPValue())
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
LOG_DEPTH = "core.parsing"

## Launcher converting a Varscan file (19-column "Varscan_2_2_8" layout) into a VCF file.
#
class VarscanToVCF(object):

    ## Constructor
    #
    # @param varscanFileName name of the input Varscan file
    # @param vcfFileName name of the output VCF file; when empty, the input
    #        file name followed by ".vcf" is used
    # @param doClean stored as given; not used by the code of this class
    # @param verbosity level given to the logger
    #
    def __init__(self, varscanFileName = "", vcfFileName = "", doClean = False, verbosity = 0):
        self._varscanFileName = varscanFileName
        self.setvcfFileName(vcfFileName)
        self._doClean = doClean
        self._verbosity = verbosity

        self._vcfRevision = "VCFv4.1"
        self._vcfHeader = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"

        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

    ## Read the attributes from the command line
    #
    def setAttributesFromCmdLine(self):
        description = "Convert Varscan file to VCF file."
        epilog = "\t$ python VarscanToVCF.py -i varscanFileName -v 2"
        parser = RepetOptionParser(description = description, epilog = epilog)
        parser.add_option("-i", "--Varscan", dest = "varscanFileName", action = "store", type = "string", help = "input Varscan file name [compulsory] [format: varscan2.2.8]", default = "")
        parser.add_option("-o", "--vcfFileName", dest = "vcfFileName", action = "store", type = "string", help = "vcfFileName file name [default: .vcf]", default = "")
        parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    ## Copy parsed command-line options into the attributes
    #
    # @param options object exposing varscanFileName, vcfFileName, doClean
    #        and verbosity
    #
    def _setAttributesFromOptions(self, options):
        self.setvarscanFileName(options.varscanFileName)
        self.setvcfFileName(options.vcfFileName)
        self.setDoClean(options.doClean)
        self.setVerbosity(options.verbosity)

    ## Set the input file name
    #
    # When the output name was derived from the input name (no explicit VCF
    # name given), it is derived again so that it follows the new input name.
    #
    # @param varscanFileName name of the input Varscan file
    #
    def setvarscanFileName(self, varscanFileName):
        self._varscanFileName = varscanFileName
        if getattr(self, "_isVcfFileNameDerived", False):
            self._vcfFileName = "%s.vcf" % varscanFileName

    ## Set the output file name
    #
    # @param vcfFileName name of the output VCF file; an empty string selects
    #        the input file name followed by ".vcf"
    #
    def setvcfFileName(self, vcfFileName):
        # Remember whether the name is derived, for setvarscanFileName().
        self._isVcfFileNameDerived = (vcfFileName == "")
        if self._isVcfFileNameDerived:
            self._vcfFileName = "%s.vcf" % self._varscanFileName
        else:
            self._vcfFileName = vcfFileName

    def setDoClean(self, doClean):
        self._doClean = doClean

    def setVerbosity(self, verbosity):
        self._verbosity = verbosity

    ## Check that an input file name was given and that the file exists
    #
    # @raise Exception when the name is empty or the file is not found
    #
    def _checkOptions(self):
        if self._varscanFileName == "":
            self._logAndRaise("ERROR: Missing input file name")
        elif not FileUtils.isRessourceExists(self._varscanFileName):
            self._logAndRaise("ERROR: Input Varscan file '%s' does not exist!" % self._varscanFileName)

    ## Log an error message, then raise it as an Exception
    #
    # @param errorMsg the message
    #
    def _logAndRaise(self, errorMsg):
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    ## Convert one Varscan line into one VCF record
    #
    # QUAL is -10*log10(p-value); INFO holds AF (variant frequency as a
    # fraction), DP (Reads1 + Reads2), RBQ (Qual1) and ABQ (Qual2). ALT is
    # recomputed from the consensus and the reference; a warning is logged
    # when it differs from the VarAllele column.
    #
    # @param varscanLine the Varscan line to convert
    # @param lineNumber line number, used in messages
    # @return the VCF record, newline-terminated
    #
    def _convertVarscanLineToVCFRecord(self, varscanLine, lineNumber):
        iVarscanFile = VarscanFile()
        iVarscanFile.setTypeOfVarscanFile("Varscan_2_2_8")
        iVarscanHit = iVarscanFile.createVarscanObjectFromLine(varscanLine, lineNumber)

        Chrom = iVarscanHit.getChrom()
        Pos = int(iVarscanHit.getPosition())
        ID = "."
        Ref = iVarscanHit.getRef()
        Alt = iVarscanHit.getVar()
        # NOTE(review): math.log10 raises ValueError for a p-value of 0 --
        # confirm whether the input can contain it.
        Qual = -10*math.log10(float(iVarscanHit.getPValue()))
        Filter = "."
        # The last character of the frequency (expected to be '%') is dropped.
        AF = float(iVarscanHit.getVarFreq()[:-1])/100
        DP = int(iVarscanHit.getReadsRef()) + int(iVarscanHit.getReadsVar())
        RBQ = iVarscanHit.getQualRef()
        ABQ = iVarscanHit.getQualVar()
        Info = ";".join(["AF=%.4f" %AF,"DP=%d" %DP,"RBQ=%s" %RBQ, "ABQ=%s" %ABQ])

        allel = Bioseq().getATGCNFromIUPACandATGCN(iVarscanHit.getCns(), Ref)
        if allel != Alt:
            self._log.warning("'VarAllele' attribute of Varscan file line '%d' was not correct. Correcting using '%s' instead of '%s'." % (lineNumber, allel, Alt))
            Alt = allel

        return "%s\t%s\t%s\t%s\t%s\t%.9f\t%s\t%s\n" % (Chrom, Pos, ID, Ref, Alt, Qual, Filter, Info)

    ## Run the conversion
    #
    # Writes the VCF header lines, then one record per input line that
    # neither starts with "#" nor contains the Varscan column header.
    #
    def run(self):
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START Varscan To VCF")
        self._log.debug("Input file name: %s" % self._varscanFileName)

        with open(self._vcfFileName, "w") as fVCF:
            fVCF.write("##fileformat=%s\n" % self._vcfRevision)
            fVCF.write("%s\n" % self._vcfHeader)

            with open(self._varscanFileName, "r") as fVarscan:
                # Line numbers start at 1 and count skipped lines too.
                for lineNumber, line in enumerate(fVarscan, 1):
                    if line[0] != "#" and "Chrom\tPosition\tRef\tCons" not in line:
                        fVCF.write(self._convertVarscanLineToVCFRecord(line, lineNumber))

        self._log.info("END Varscan To VCF")

if __name__ == "__main__":
    iLaunch = VarscanToVCF()
    iLaunch.setAttributesFromCmdLine()
    iLaunch.run()
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import re +import sys +import os.path +import struct +from commons.core.parsing.TranscriptListParser import TranscriptListParser +from SMART.Java.Python.structure.Transcript import Transcript + +STRANDTOSTR = {1: "(+)", 0: "(=)", None: "(=)", -1: "(-)"} + +nbOpenHandles = 30 + + +class WigParser(TranscriptListParser): + """A class that parses a big WIG file, creates an index and make it possible to quickly retrieve some data""" + + def __init__(self, fileName, verbosity = 1): + self.fileName = fileName + self.filler = "\xFF" * struct.calcsize('Q') + self.strands = False + self.indexFiles = {} + self.indexBuilt = False + self.defaultValue = 0.0 + self.currentChromosome = None + self.currentStrand = 1 + self.verbosity = verbosity + super(WigParser, self).__init__(fileName, verbosity) + + + def __def__(self): + for file in self.indexFiles.values(): + file.close() + + + def setStrands(self, strands): + self.strands = strands + + + def setDefaultValue(self, value): + self.defaultValue = value + + + def getFileFormats(): + return ["wig"] + getFileFormats = staticmethod(getFileFormats) + + + def setStrands(self, strands): + """ + Consider both 
strands separately + """ + self.strands = strands + + + def makeIndexName(self, chromosome, strand = None): + """ + Create an index name for a file + """ + directoryName = os.path.dirname(self.fileName) + if strand == None: + strandName = "" + else: + strandName = "+" if strand == 1 else "-" + indexName = os.path.join(directoryName, ".%s%s.index" % (chromosome, strandName)) + return indexName + + + def findIndexFile(self, chromosome, strand = None): + """ + Check if the index of a file exists + """ + indexName = self.makeIndexName(chromosome, strand) + if os.path.exists(indexName): + return indexName + return False + + + def makeIndexFile(self): + """ + Create the index for a file + """ + if self.indexBuilt: + return + + inputFile = open(self.fileName) + outputFile = None + index = 0 + mark = inputFile.tell() + line = inputFile.readline().strip() + chromosome = None + + while line != "": + m1 = re.search(r"^\s*-?\d+\.?\d*\s*$", line) + m2 = re.search(r"^\s*(\d+)\s+-?\d+\.?\d*\s*$", line) + m3 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=(\d+)\s+step=1\s*$", line) + m4 = re.search(r"^\s*fixedStep\s+chrom=\S+\s+start=\d+\s+step=\d+\s+span=\d+\s*$", line) + m5 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line) + m6 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s+span=(\d+)\s*$", line) + + if m1 != None: + outputFile.write(struct.pack("Q", mark)) + index += 1 + elif m2 != None: + nextIndex = int(m2.group(1)) + if index < nextIndex - 1: + outputFile.write(self.filler * (nextIndex - index - 1)) + outputFile.write(struct.pack("Q", mark)) + index = nextIndex + elif m3 != None: + newChromosome = m3.group(1) + if newChromosome != chromosome: + if outputFile != None: + outputFile.close() + outputFile = open(self.makeIndexName(newChromosome), "wb") + chromosome = newChromosome + nextIndex = int(m3.group(2)) + outputFile.write(self.filler * (nextIndex - index)) + index = nextIndex + elif m4 != None: + raise Exception("Error! 
Cannot parse fixed step WIG files with step > 1 or span > 1") + elif m5 != None: + newChromosome = m5.group(1) + if newChromosome != chromosome: + if outputFile != None: + outputFile.close() + outputFile = open(self.makeIndexName(newChromosome), "wb") + index = 0 + outputFile.write(self.filler) + chromosome = newChromosome + elif m6 != None: + if m6.group(2) != "1": + raise Exception("Error! Cannot parse variable step WIG files with step > 1 or span > 1") + newChromosome = m6.group(1) + if newChromosome != chromosome: + if outputFile != None: + outputFile.close() + outputFile = open(self.makeIndexName(newChromosome), "wb") + index = 0 + outputFile.write(self.filler) + chromosome = newChromosome + elif (len(line) == 0) or line[0] == "#" or line.startswith("track"): + pass + else: + raise Exception("Error! Cannot understand line '%s' of WIG file while creating index file! Aborting." % (line)) + + mark = inputFile.tell() + line = inputFile.readline().strip() + + inputFile.close + if outputFile != None: + outputFile.close() + self.indexBuilt = True + + + def getIndexFileHandle(self, chromosome, strand = None): + """ + Get the handle of an index file + """ + indexFileKey = chromosome + if strand != None: + indexFileKey += "+" if strand == 1 else "-" + if indexFileKey in self.indexFiles: + return self.indexFiles[indexFileKey] + + indexFileName = self.makeIndexName(chromosome, strand) + if not self.findIndexFile(chromosome, strand): + self.makeIndexFile() + + if not os.path.exists(indexFileName): + print "Warning! Index for chromosome %s, strand %s does not exist." 
% (chromosome, STRANDTOSTR[strand]) + return False + indexFile = open(indexFileName, "rb") + + if len(self.indexFiles.keys()) > nbOpenHandles: + removedKey = set(self.indexFiles.keys()).pop() + self.indexFiles[removedKey].close() + del self.indexFiles[removedKey] + self.indexFiles[indexFileKey] = indexFile + return indexFile + + + + def findIndex(self, chromosome, start, strand = None): + """ + Find the point where to start reading file + """ + + sizeOfLong = struct.calcsize("Q") + empty = int(struct.unpack("Q", self.filler)[0]) + offset = empty + indexFile = self.getIndexFileHandle(chromosome, strand) + + if not indexFile: + return (None, None) + + while offset == empty: + address = start * sizeOfLong + indexFile.seek(address, os.SEEK_SET) + + buffer = indexFile.read(sizeOfLong) + if len(buffer) != sizeOfLong: + if buffer == "": + print "Warning! Index position %d of chromosome %s on strand %s seems out of range!" % (start, chromosome, STRANDTOSTR[strand]) + return (None, None) + else: + raise Exception("Problem fetching position %d of chromosome %s on strand %s seems out of range!" 
% (start, chromosome, STRANDTOSTR[strand])) + + offset = int(struct.unpack("Q", buffer)[0]) + start += 1 + + start -= 1 + return (offset, start) + + + + def getRange(self, chromosome, start, end): + """ + Parse a wig file and output a range + """ + arrays = {} + strands = {1: "+", -1: "-"} if self.strands else {0: ""} + + for strand in strands: + + array = [self.defaultValue] * (end - start + 1) + file = open(self.fileName) + offset, index = self.findIndex(chromosome, start, strand if self.strands else None) + if offset == None: + arrays[strand] = array + continue + file.seek(offset, os.SEEK_SET) + + for line in file: + line = line.strip() + + m1 = re.search(r"^\s*(-?\d+\.?\d*)\s*$", line) + m2 = re.search(r"^\s*(\d+)\s+(-?\d+\.?\d*)\s*$", line) + m3 = re.search(r"^\s*fixedStep\s+chrom=(\S+)\s+start=(\d+)\s+step=\d+\s*$", line) + m4 = re.search(r"^\s*variableStep\s+chrom=(\S+)\s*$", line) + + if m1 != None: + if index > end: + break + if index >= start: + array[index - start] = float(m1.group(1)) + index += 1 + elif m2 != None: + index = int(m2.group(1)) + if index > end: + break + if index >= start: + array[index - start] = float(m2.group(2)) + index += 1 + elif m3 != None: + if m3.group(1) != "%s%s" % (chromosome, strands[strand]): + break + index = int(m3.group(2)) + elif m4 != None: + if m4.group(1) != "%s%s" % (chromosome, strands[strand]): + break + elif (len(line) == 0) or (line[0] == "#") or line.startswith("track"): + pass + else: + raise Exception("Error! 
Cannot read line '%s' of wig file" % (line)) + + file.close() + + arrays[strand] = array + + if self.strands: + return arrays + return array + + + def skipFirstLines(self): + return + + + def parseLine(self, line): + if line.startswith("track"): + return None + m = re.search(r"^\s*variableStep\s+chrom=(\S+)", line) + if m != None: + chromosome = m.group(1) + if chromosome.endswith("+"): + self.currentStrand = 1 + self.currentChromosome = chromosome[:-1] + elif chromosome.endswith("-"): + self.currentStrand = -1 + self.currentChromosome = chromosome[:-1] + else: + self.currentStrand = 1 + self.currentChromosome = chromosome + return None + position, value = line.split() + position = int(position) + value = float(value) + transcript = Transcript() + transcript.setChromosome(self.currentChromosome) + transcript.setStart(position) + transcript.setEnd(position) + transcript.setDirection(self.currentStrand) + transcript.setTagValue("ID", "wig_%s_%d_%d" % (self.currentChromosome, self.currentStrand, position)) + transcript.setTagValue("nbElements", value) + return transcript diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/parsing/multifastaParserLauncher.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/multifastaParserLauncher.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +""" +Launcher for the multifasta parser. +@param b: Name of the batch of sequences +@param g: Name of the gene +@param t: Scientific name of the taxon concerned +@param f: Name of the multifasta input file +""" + + +import os +import sys +import getopt +from commons.core.parsing.Multifasta2SNPFile import Multifasta2SNPFile + +CURRENT_DIR = os.getcwd() + +def help(): + + """ + Give the list of the command-line options. 
+ """ + + print "Usage: ",sys.argv[0],"[ options ]" + print " -h: this help" + print "Mandatory option:" + print " -t: Scientific name of the taxon concerned" + print "Exclusive options (use either the first or the second, one should be used)" + print " -f: Name of the multifasta input file in one batch mode" + print " -d: Name of the directory containing multifasta input file(s) in multi-batch mode" + print "Only in one batch mode: mandatory options (when -f is used):" + print " -b: Name of the batch of submitted sequences" + print " -g: Name of the gene" + print "" + + +def runOneInputFile(batchName, geneName, taxon, inputFileName): + print "Multifasta parseur launched:!\n" + print "-- Input File: " + inputFileName + "\n" + print "-- Batch name: " + batchName + "\n" + print "-- Gene name: " + geneName + "\n" + print "-- Taxon: " + taxon + "\n" + #TODO: gerer le delete des fichiers(mode append) + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, geneName) + multifasta2SNPFile.runOneBatch(inputFileName) + print "OK: Files generated!" 
+ + +def runSeveralInputFile(taxon, rootDirectoryName): + multifasta2SNPFile = Multifasta2SNPFile(taxon) + multifasta2SNPFile.runSeveralBatches(rootDirectoryName) + +def main(): + batchName = "" + geneName = "" + taxon = "" + inputFileName = "" + rootDirectoryName = "" + + + try: + opts,args = getopt.getopt(sys.argv[1:],"hb:g:t:f:d:") + except getopt.GetoptError: + print "Invalid options\n" + help() + sys.exit(2) + + for o, a in opts: + if o == "-h": + help() + exit(0) + elif o == "-b": + batchName = a + elif o == "-g": + geneName = a + elif o == "-t": + taxon = a + elif o == "-f": + inputFileName = a + elif o == "-d": + rootDirectoryName = os.path.abspath(a) + + if taxon == "": + print "*** Error: The mandatory option -t is missing" + help() + sys.exit(1) + + if (inputFileName == "" and rootDirectoryName == "") or (inputFileName != "" and rootDirectoryName != ""): + print "*** Error: You have to specify the input mode: choose either -f (for one file) or -d (for one directory of several files)" + help() + sys.exit(1) + + if(inputFileName != ""): + if batchName == "" or geneName == "": + print "*** Error: A mandatory option is missing in one batch mode (-b or -g)" + help() + sys.exit(1) + + if(inputFileName != ""): + runOneInputFile(batchName, geneName, taxon, inputFileName) + else: + runSeveralInputFile(taxon, rootDirectoryName) + + + return 0 + +#------------------------------------------------------------------------------ +if __name__ == "__main__": + main() \ No newline at end of file diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/seq/AlignedBioseqDB.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/AlignedBioseqDB.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,440 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +import sys +from commons.core.seq.BioseqDB import BioseqDB +from commons.core.seq.Bioseq import Bioseq +from commons.core.coord.Align import Align +from commons.core.coord.Range import Range +from commons.core.stat.Stat import Stat +from math import log + + +## Multiple Sequence Alignment Representation +# +# +class AlignedBioseqDB( BioseqDB ): + + def __init__( self, name="" ): + BioseqDB.__init__( self, name ) + seqLength = self.getLength() + if self.getSize() > 1: + for bs in self.db[1:]: + if bs.getLength() != seqLength: + print "ERROR: aligned sequences have different length" + + + ## Get length of the alignment + # + # @return length + # @warning name before migration was 'length' + # + def getLength( self ): + length = 0 + if self.db != []: + length = self.db[0].getLength() + return length + + + ## Get the true length of a given sequence (without gaps) + # + # @param header string header of the sequence to analyze + # @return length integer + # @warning name before migration was 'true_length' + # + def getSeqLengthWithoutGaps( self, header ): + bs = self.fetch( header ) + count = 0 + for pos in xrange(0,len(bs.sequence)): + if bs.sequence[pos] != "-": + count += 1 + return count + + def cleanMSA( self ): + #TODO: Refactoring + """clean the MSA""" + i2del = [] + + # for each sequence in the MSA + for seqi in xrange(0,self.getSize()): + if seqi in i2del: + continue + #define it as the reference + ref = self.db[seqi].sequence + refHeader = self.db[seqi].header + # for each following sequence + for seq_next in xrange(seqi+1,self.getSize()): + if seq_next in i2del: + continue + keep = 0 + # for each position along the MSA + for posx in xrange(0,self.getLength()): + seq = self.db[seq_next].sequence + if seq[posx] != '-' and ref[posx] != '-': + keep = 1 + break + seqHeader = self.db[seq_next].header + # if there is at least one gap between the ref seq and the other seq + # keep track of the shortest by recording it in "i2del" + if keep == 0: + + if 
self.getSeqLengthWithoutGaps(refHeader) < self.getSeqLengthWithoutGaps(seqHeader): + if seqi not in i2del: + i2del.append( seqi ) + else: + if seq_next not in i2del: + i2del.append( seq_next ) + + # delete from the MSA each seq present in the list "i2del" + for i in reversed(sorted(set(i2del))): + del self.db[i] + + self.idx = {} + count = 0 + for i in self.db: + self.idx[i.header] = count + count += 1 + + ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site + # + # @return: list of dico whose keys are symbols and values are their occurrences + # + def getListOccPerSite( self ): + lOccPerSite = [] # list of dictionaries, one per position on the sequence + n = 0 # nb of sequences parsed from the input file + firstSeq = True + + # for each sequence in the bank + for bs in self.db: + if bs.sequence == None: + break + n += 1 + + # if it is the first to be parsed, create a dico at each site + if firstSeq: + for i in xrange(0,len(bs.sequence)): + lOccPerSite.append( {} ) + firstSeq = False + + # for each site, add its nucleotide + for i in xrange(0,len(bs.sequence)): + nuc = bs.sequence[i].upper() + if lOccPerSite[i].has_key( nuc ): + lOccPerSite[i][nuc] += 1 + else: + lOccPerSite[i][nuc] = 1 + + return lOccPerSite + + #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus... 
+ ## Make a consensus from the MSA + # + # @param minNbNt: minimum nb of nucleotides to edit a consensus + # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0) + # @param verbose: level of information sent to stdout (default=0/1) + # @return: consensus + # + def getConsensus( self, minNbNt, minPropNt=0.0, verbose=0 , isHeaderSAtannot=False): + + maxPropN = 0.40 # discard consensus if more than 40% of N's + + nbInSeq = self.getSize() + if verbose > 0: + print "nb of aligned sequences: %i" % ( nbInSeq ); sys.stdout.flush() + if nbInSeq < 2: + print "ERROR: can't make a consensus with less than 2 sequences" + sys.exit(1) + if minNbNt >= nbInSeq: + minNbNt = nbInSeq - 1 + print "minNbNt=%i" % ( minNbNt ) + if minPropNt >= 1.0: + print "ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % ( minPropNt ) + sys.exit(1) + + lOccPerSite = self.getListOccPerSite() + nbSites = len(lOccPerSite) + if verbose > 0: + print "nb of sites: %i" % ( nbSites ); sys.stdout.flush() + + seqConsensus = "" + + # for each site (i.e. each column of the MSA) + nbRmvColumns = 0 + countSites = 0 + for dNt2Occ in lOccPerSite: + countSites += 1 + if verbose > 1: + print "site %s / %i" % ( str(countSites).zfill( len(str(nbSites)) ), + nbSites ) + sys.stdout.flush() + occMaxNt = 0 # occurrences of the predominant nucleotide at this site + lBestNt = [] + nbNt = 0 # total nb of A, T, G and C (no gap) + + # for each distinct symbol at this site (A, T, G, C, N, -,...) + for j in dNt2Occ.keys(): + if j != "-": + nbNt += dNt2Occ[j] + if verbose > 1: + print "%s: %i" % ( j, dNt2Occ[j] ) + if dNt2Occ[j] > occMaxNt: + occMaxNt = dNt2Occ[j] + lBestNt = [ j ] + elif dNt2Occ[j] == occMaxNt: + lBestNt.append( j ) + if nbNt == 0: # some MSA programs can remove some sequences (e.g. 
Muscle after Recon) or when using Refalign (non-alignable TE fragments put together via a refseq) + nbRmvColumns += 1 + + if len( lBestNt ) >= 1: + bestNt = lBestNt[0] + + # if the predominant nucleotide occurs in less than x% of the sequences, put a "N" + if minPropNt > 0.0 and nbNt != 0 and float(occMaxNt)/float(nbNt) < minPropNt: + bestNt = "N" + + if int(nbNt) >= int(minNbNt): + seqConsensus += bestNt + if verbose > 1: + print "-> %s" % ( bestNt ) + + if nbRmvColumns: + if nbRmvColumns == 1: + print "WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100) + else: + print "WARNING: %i sites were removed (%.2f%%)" % ( nbRmvColumns, nbRmvColumns / float(nbSites) * 100 ) + sys.stdout.flush() + if seqConsensus == "": + print "WARNING: no consensus can be built (no sequence left)" + return + + propN = seqConsensus.count("N") / float(len(seqConsensus)) + if propN >= maxPropN: + print "WARNING: no consensus can be built (%i%% of N's >= %i%%)" % ( propN * 100, maxPropN * 100 ) + return + elif propN >= maxPropN * 0.5: + print "WARNING: %i%% of N's" % ( propN * 100 ) + + consensus = Bioseq() + consensus.sequence = seqConsensus + if isHeaderSAtannot: + header = self.db[0].header + pyramid = header.split("Gr")[1].split("Cl")[0] + pile = header.split("Cl")[1].split(" ")[0] + consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid) + else: + consensus.header = "consensus=%s length=%i nbAlign=%i" % ( self.name, len(seqConsensus), self.getSize() ) + + if verbose > 0: + + statEntropy = self.getEntropy( verbose - 1 ) + print "entropy: %s" % ( statEntropy.stringQuantiles() ) + sys.stdout.flush() + + return consensus + + + ## Get the entropy of the whole multiple alignment (only for A, T, G and C) + # + # @param verbose level of verbosity + # + # @return statistics about the entropy of the MSA + # + def getEntropy( self, verbose=0 ): + + stats = Stat() + + # get the occurrences of 
symbols at each site + lOccPerSite = self.getListOccPerSite() + + countSite = 0 + + # for each site + for dSymbol2Occ in lOccPerSite: + countSite += 1 + + # count the number of nucleotides (A, T, G and C, doesn't count gap '-') + nbNt = 0 + dATGC2Occ = {} + for base in ["A","T","G","C"]: + dATGC2Occ[ base ] = 0.0 + for nt in dSymbol2Occ.keys(): + if nt != "-": + nbNt += dSymbol2Occ[ nt ] + checkedNt = self.getATGCNFromIUPAC( nt ) + if checkedNt in ["A","T","G","C"] and dSymbol2Occ.has_key( checkedNt ): + dATGC2Occ[ checkedNt ] += 1 * dSymbol2Occ[ checkedNt ] + else: # for 'N' + if dSymbol2Occ.has_key( checkedNt ): + dATGC2Occ[ "A" ] += 0.25 * dSymbol2Occ[ checkedNt ] + dATGC2Occ[ "T" ] += 0.25 * dSymbol2Occ[ checkedNt ] + dATGC2Occ[ "G" ] += 0.25 * dSymbol2Occ[ checkedNt ] + dATGC2Occ[ "C" ] += 0.25 * dSymbol2Occ[ checkedNt ] + if verbose > 2: + for base in dATGC2Occ.keys(): + print "%s: %i" % ( base, dATGC2Occ[ base ] ) + + # compute the entropy for the site + entropySite = 0.0 + for nt in dATGC2Occ.keys(): + entropySite += self.computeEntropy( dATGC2Occ[ nt ], nbNt ) + if verbose > 1: + print "site %i (%i nt): entropy = %.3f" % ( countSite, nbNt, entropySite ) + stats.add( entropySite ) + + return stats + + + ## Get A, T, G, C or N from an IUPAC letter + # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'] + # + # @return A, T, G, C or N + # + def getATGCNFromIUPAC( self, nt ): + iBs = Bioseq() + return iBs.getATGCNFromIUPAC( nt ) + + + ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides + # + def computeEntropy( self, nbOcc, nbNt ): + if nbOcc == 0.0: + return 0.0 + else: + freq = nbOcc / float(nbNt) + return - freq * log(freq) / log(2) + + + ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise + # + def saveAsBinaryMatrix( self, outFile ): + outFileHandler = open( outFile, "w" ) + for bs in self.db: + string = "%s" % ( bs.header ) + for nt in bs.sequence: + if nt 
!= "-": + string += "\t%i" % ( 1 ) + else: + string += "\t%i" % ( 0 ) + outFileHandler.write( "%s\n" % ( string ) ) + outFileHandler.close() + + + ## Return a list of Align instances corresponding to the aligned regions (without gaps) + # + # @param query string header of the sequence considered as query + # @param subject string header of the sequence considered as subject + # + def getAlignList( self, query, subject ): + lAligns = [] + alignQ = self.fetch( query ).sequence + alignS = self.fetch( subject ).sequence + createNewAlign = True + indexAlign = 0 + indexQ = 0 + indexS = 0 + while indexAlign < len(alignQ): + if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-": + indexQ += 1 + indexS += 1 + if createNewAlign: + iAlign = Align( Range( query, indexQ, indexQ ), + Range( subject, indexS, indexS ), + 0, + int( alignQ[ indexAlign ] == alignS[ indexAlign ] ), + int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) ) + lAligns.append( iAlign ) + createNewAlign = False + else: + lAligns[-1].range_query.end += 1 + lAligns[-1].range_subject.end += 1 + lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) + lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) + else: + if not createNewAlign: + lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery() + createNewAlign = True + if alignQ[ indexAlign ] != "-": + indexQ += 1 + elif alignS[ indexAlign ] != "-": + indexS += 1 + indexAlign += 1 + if not createNewAlign: + lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery() + return lAligns + + + def removeGaps(self): + for iBs in self.db: + iBs.removeSymbol( "-" ) + + ## Compute mean per cent identity for MSA. + # First sequence in MSA is considered as reference sequence. 
+ # + # + def computeMeanPcentIdentity(self): + seqRef = self.db[0] + sumPcentIdentity = 0 + + for seq in self.db[1:]: + pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq) + sumPcentIdentity = sumPcentIdentity + pcentIdentity + + nbSeq = len(self.db[1:]) + meanPcentIdentity = round (sumPcentIdentity/nbSeq) + + return meanPcentIdentity + + def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq): + indexOnSeqRef = 0 + sumIdentity = 0 + for nuclSeq in seq.sequence: + nuclRef = seqRef.sequence[indexOnSeqRef] + + if nuclRef != "-" and nuclRef == nuclSeq: + sumIdentity = sumIdentity + 1 + indexOnSeqRef = indexOnSeqRef + 1 + + return float(sumIdentity) / float(seqRef.getLength()) * 100 + + + + + + + + + + + + + + + diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/seq/Bioseq.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/Bioseq.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,735 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
+# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import sys +import string +import re +import random +import cStringIO +from commons.core.coord.Map import Map +from commons.core.checker.RepetException import RepetException + +DNA_ALPHABET_WITH_N = set( ['A','T','G','C','N'] ) +IUPAC = set(['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']) + + +## Record a sequence with its header +# +class Bioseq( object ): + + header = "" + sequence = "" + + ## constructor + # + # @param name the header of sequence + # @param seq sequence (DNA, RNA, protein) + # + def __init__( self, name="", seq="" ): + self.header = name + self.sequence = seq + + + ## Equal operator + # + def __eq__( self, o ): + if self.header==o.header and self.sequence==o.sequence: + return True + return False + + + ## overload __repr__ + # + def __repr__( self ): + return "%s;%s" % ( self.header, self.sequence ) + + + ## set attribute header + # + # @param header a string + # + def setHeader( self, header ): + self.header = header + + + ## get attribute header + # + # @return header + def getHeader(self): + return self.header + + + ## set attribute sequence + # + # @param sequence a string + # + def setSequence( 
self, sequence ): + self.sequence = sequence + + + def getSequence(self): + return self.sequence + + ## reset + # + def reset( self ): + self.setHeader( "" ) + self.setSequence( "" ) + + + ## Test if bioseq is empty + # + def isEmpty( self ): + return self.header == "" and self.sequence == "" + + + ## Reverse the sequence + # + def reverse( self ): + tmp = self.sequence + self.sequence = tmp[::-1] + + + ## Turn the sequence into its complement + # Force upper case letters + # @warning: old name in pyRepet.Bioseq realComplement + # + def complement( self ): + complement = "" + self.upCase() + for i in xrange(0,len(self.sequence),1): + if self.sequence[i] == "A": + complement += "T" + elif self.sequence[i] == "T": + complement += "A" + elif self.sequence[i] == "C": + complement += "G" + elif self.sequence[i] == "G": + complement += "C" + elif self.sequence[i] == "M": + complement += "K" + elif self.sequence[i] == "R": + complement += "Y" + elif self.sequence[i] == "W": + complement += "W" + elif self.sequence[i] == "S": + complement += "S" + elif self.sequence[i] == "Y": + complement += "R" + elif self.sequence[i] == "K": + complement += "M" + elif self.sequence[i] == "V": + complement += "B" + elif self.sequence[i] == "H": + complement += "D" + elif self.sequence[i] == "D": + complement += "H" + elif self.sequence[i] == "B": + complement += "V" + elif self.sequence[i] == "N": + complement += "N" + elif self.sequence[i] == "-": + complement += "-" + else: + print "WARNING: unknown symbol '%s', replacing it by N" % ( self.sequence[i] ) + complement += "N" + self.sequence = complement + + + ## Reverse and complement the sequence + # + # Force upper case letters + # @warning: old name in pyRepet.Bioseq : complement + # + def reverseComplement( self ): + self.reverse() + self.complement() + + + ## Remove gap in the sequence + # + def cleanGap(self): + self.sequence = self.sequence.replace("-","") + + + ## Copy current Bioseq Instance + # + # @return: a Bioseq instance, a 
copy of current sequence. + # + def copyBioseqInstance(self): + seq = Bioseq() + seq.sequence = self.sequence + seq.header = self.header + return seq + + + ## Add phase information after the name of sequence in header + # + # @param phase integer representing phase (1, 2, 3, -1, -2, -3) + # + def setFrameInfoOnHeader(self, phase): + if " " in self.header: + name, desc = self.header.split(" ", 1) + name = name + "_" + str(phase) + self.header = name + " " + desc + else: + self.header = self.header + "_" + str(phase) + + + ## Fill Bioseq attributes with fasta file + # + # @param faFileHandler file handler of a fasta file + # + def read( self, faFileHandler ): + line = faFileHandler.readline() + if line == "": + self.header = None + self.sequence = None + return + while line == "\n": + line = faFileHandler.readline() + if line[0] == '>': + self.header = string.rstrip(line[1:]) + else: + print "error, line is",string.rstrip(line) + return + line = " " + seq = cStringIO.StringIO() + while line: + prev_pos = faFileHandler.tell() + line = faFileHandler.readline() + if line == "": + break + if line[0] == '>': + faFileHandler.seek( prev_pos ) + break + seq.write( string.rstrip(line) ) + self.sequence = seq.getvalue() + + + ## Create a subsequence with a modified header + # + # @param s integer start a required subsequence + # @param e integer end a required subsequence + # + # @return a Bioseq instance, a subsequence of current sequence + # + def subseq( self, s, e=0 ): + if e == 0 : + e=len( self.sequence ) + if s > e : + print "error: start must be < or = to end" + return + if s <= 0 : + print "error: start must be > 0" + return + sub = Bioseq() + sub.header = self.header + " fragment " + str(s) + ".." 
+ str(e) + sub.sequence = self.sequence[(s-1):e] + return sub + + + ## Get the nucleotide or aminoacid at the given position + # + # @param pos integer nucleotide or aminoacid position + # + # @return a string + # + def getNtFromPosition(self, pos): + result = None + if not (pos < 1 or pos > self.getLength()): + result = self.sequence[pos - 1] + return result + + + ## Print in stdout the Bioseq in fasta format with 60 characters lines + # + # @param l length of required sequence default is whole sequence + # + def view(self,l=0): + print '>'+self.header + i=0 + if(l==0): + l=len(self.sequence) + seq=self.sequence[0:l] + + while iMbS1566Gr81Cl81 Dmel_Grouper_3091_Malign_3:LARD {Fragment} 1..5203' + # @return header (string) + # + def getHeaderFullSeq( self ): + data = self.header.split() + return data[1] + + + ## Get the strand of the fragment (output from Grouper) + # + # @return: strand (+ or -) + # + def getFragStrand( self ): + data = self.header.split() + coord = data[3].split("..") + if int(coord[0]) < int(coord[-1]): + return "+" + else: + return "-" + + + ## Get A, T, G, C or N from an IUPAC letter + # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N'] + # + # @return A, T, G, C or N + # + def getATGCNFromIUPAC( self, nt ): + subset = ["A","T","G","C","N"] + + if nt in subset: + return nt + elif nt == "U": + return "T" + elif nt == "R": + return random.choice( "AG" ) + elif nt == "Y": + return random.choice( "CT" ) + elif nt == "M": + return random.choice( "CA" ) + elif nt == "K": + return random.choice( "TG" ) + elif nt == "W": + return random.choice( "TA" ) + elif nt == "S": + return random.choice( "CG" ) + elif nt == "B": + return random.choice( "CTG" ) + elif nt == "D": + return random.choice( "ATG" ) + elif nt == "H": + return random.choice( "ATC" ) + elif nt == "V": + return random.choice( "ACG" ) + else: + return "N" + + ## Get nucleotide from an IUPAC letter and a nucleotide + # Works only for IUPAC code with two possibilities 
['R','Y','M','K','W','S'] + # Examples: + # Y and C returns T + # Y and T returns C + # B and C throws RepetException + # + # @return A, T, G, C + # + def getATGCNFromIUPACandATGCN(self, IUPACCode, nt): + if IUPACCode == "R": + possibleNt = set(["A", "G"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "Y": + possibleNt = set(["C", "T"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "M": + possibleNt = set(["A", "C"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "K": + possibleNt = set(["T", "G"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "W": + possibleNt = set(["A", "T"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + elif IUPACCode == "S": + possibleNt = set(["C", "G"]) + if nt not in possibleNt: + raise RepetException("IUPAC code '%s' and nucleotide '%s' are not compatible" % (IUPACCode, nt)) + return (possibleNt - set(nt)).pop() + + else: + raise RepetException("Can't retrieve the third nucleotide from IUPAC code '%s' and nucleotide '%s'" % (IUPACCode, nt)) + + def getSeqWithOnlyATGCN( self ): + newSeq = "" + for nt in self.sequence: + newSeq += self.getATGCNFromIUPAC( nt ) + return newSeq + + + ## Replace any symbol not in (A,T,G,C,N) by another nucleotide it represents + # + def partialIUPAC( self ): + self.sequence = self.getSeqWithOnlyATGCN() + + + ## Remove non Unix end-of-line 
symbols, if any + # + def checkEOF( self ): + symbol = "\r" # corresponds to '^M' from Windows + if symbol in self.sequence: + print "WARNING: Windows EOF removed in '%s'" % ( self.header ) + sys.stdout.flush() + newSeq = self.sequence.replace( symbol, "" ) + self.sequence = newSeq + + + ## Write Bioseq instance into a fasta file handler + # + # @param faFileHandler file handler of a fasta file + # + def write( self, faFileHandler ): + faFileHandler.write( ">%s\n" % ( self.header ) ) + self.writeSeqInFasta( faFileHandler ) + + + ## Write only the sequence of Bioseq instance into a fasta file handler + # + # @param faFileHandler file handler of a fasta file + # + def writeSeqInFasta( self, faFileHandler ): + i = 0 + while i < self.getLength(): + faFileHandler.write( "%s\n" % ( self.sequence[i:i+60] ) ) + i += 60 + + + ## Append Bioseq instance to a fasta file + # + # @param faFile name of a fasta file as a string + # @param mode 'write' or 'append' + # + def save( self, faFile, mode="a" ): + faFileHandler = open( faFile, mode ) + self.write( faFileHandler ) + faFileHandler.close() + + + ## Append Bioseq instance to a fasta file + # + # @param faFile name of a fasta file as a string + # + def appendBioseqInFile( self, faFile ): + self.save( faFile, "a" ) + + + ## Write Bioseq instance into a fasta file handler + # + # @param faFileHandler file handler on a file with writing right + # + def writeABioseqInAFastaFile( self, faFileHandler ): + self.write( faFileHandler ) + + + ## Write Bioseq instance with other header into a fasta file handler + # + # @param faFileHandler file handler on a file with writing right + # @param otherHeader a string representing a new header (without the > and the \n) + # + def writeWithOtherHeader( self, faFileHandler, otherHeader ): + self.header = otherHeader + self.write( faFileHandler ) + + + ## Append Bioseq header and Bioseq sequence in a fasta file + # + # @param faFileHandler file handler on a file with writing right + # @param 
otherHeader a string representing a new header (without the > and the \n) + # + def writeABioseqInAFastaFileWithOtherHeader( self, faFileHandler, otherHeader ): + self.writeWithOtherHeader( faFileHandler, otherHeader ) + + + ## get the list of Maps corresponding to seq without gap + # + # @warning This method was called getMap() in pyRepet.Bioseq + # @return a list of Map object + # + def getLMapWhithoutGap( self ): + lMaps = [] + countSite = 1 + countSubseq = 1 + inGap = False + startMap = -1 + endMap = -1 + + # initialize with the first site + if self.sequence[0] == "-": + inGap = True + else: + startMap = countSite + + # for each remaining site + for site in self.sequence[1:]: + countSite += 1 + + # if it is a gap + if site == "-": + + # if this is the beginning of a gap, record the previous subsequence + if inGap == False: + inGap = True + endMap = countSite - 1 + lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) ) + countSubseq += 1 + + # if it is NOT a gap + if site != "-": + + # if it is the end of a gap, begin the next subsequence + if inGap == True: + inGap = False + startMap = countSite + + # if it is the last site + if countSite == self.getLength(): + endMap = countSite + lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) ) + + return lMaps + + + ## get the percentage of GC + # + # @return a percentage + # + def getGCpercentage( self ): + tmpSeq = self.getSeqWithOnlyATGCN() + nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" ) + return 100 * nbGC / float( self.getLength() ) + + ## get the percentage of GC of a sequence without counting N in sequence length + # + # @return a percentage + # + def getGCpercentageInSequenceWithoutCountNInLength(self): + tmpSeq = self.getSeqWithOnlyATGCN() + nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" ) + return 100 * nbGC / float( self.getLength() - self.countNt("N") ) + + ## get the 5 prime subsequence of a given length at the given 
position + # + # @param position integer + # @param flankLength integer subsequence length + # @return a sequence string + # + def get5PrimeFlank(self, position, flankLength): + if(position == 1): + return "" + else: + startOfFlank = 1 + endOfFlank = position -1 + + if((position - flankLength) > 0): + startOfFlank = position - flankLength + else: + startOfFlank = 1 + + return self.subseq(startOfFlank, endOfFlank).sequence + + + ## get the 3 prime subsequence of a given length at the given position + # In the case of indels, the polymorphism length can be specified + # + # @param position integer + # @param flankLength integer subsequence length + # @param polymLength integer polymorphism length + # @return a sequence string + # + def get3PrimeFlank(self, position, flankLength, polymLength = 1): + if((position + polymLength) > len( self.sequence )): + return "" + else: + startOfFlank = position + polymLength + + if((position+polymLength+flankLength) > len( self.sequence )): + endOfFlank = len( self.sequence ) + else: + endOfFlank = position+polymLength+flankLength-1 + + return self.subseq(startOfFlank, endOfFlank).sequence + + + def _createWordList(self,size,l=['A','T','G','C']): + if size == 1 : + return l + else: + l2 = [] + for i in l: + for j in ['A','T','G','C']: + l2.append( i + j ) + return self._createWordList(size-1,l2) + + + def removeSymbol( self, symbol ): + tmp = self.sequence.replace( symbol, "" ) + self.sequence = tmp diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/seq/BioseqDB.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/BioseqDB.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,461 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. 
You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import sys
import re
from commons.core.seq.Bioseq import Bioseq
from commons.core.stat.Stat import Stat


## Handle a collection of a Bioseq (header-sequence)
#
# Instances are stored in insertion order in 'db'. 'idx' maps each header to
# its position in 'db'; 'idx_renamed' does the same for the header in which
# "::", ":" and "," are replaced by "-" and " " by "_".
#
class BioseqDB(object):

    ## Constructor
    #
    # @param name string name of a fasta file to load; nothing is read when empty
    #
    def __init__(self, name=""):
        self.idx = {}
        self.idx_renamed = {}
        self.db = []
        self.name = name
        if name != "":
            faFile = open(name)
            try:
                self.read(faFile)
            finally:
                faFile.close()
        self.mean_seq_lgth = None
        self.stat = Stat()


    ## Equal operator
    #
    # Two banks are equal when they have the same size and every Bioseq of
    # this bank is equal to at least one Bioseq of the other (order is ignored).
    #
    def __eq__(self, o):
        if self.getSize() != o.getSize():
            return False
        for iBioseq in self.db:
            hasTwin = False
            for jBioseq in o.db:
                if iBioseq == jBioseq:
                    hasTwin = True
                    break
            if not hasTwin:
                return False
        return True


    ## Change the name of the BioseqDB
    #
    # @param name the BioseqDB name
    #
    def setName(self, name):
        self.name = name


    ## Record each sequence of the input file as a list of Bioseq instances
    #
    # Reading stops at the first Bioseq whose sequence is None.
    #
    # @param faFileHandler handler of a fasta file
    #
    def read(self, faFileHandler):
        while True:
            seq = Bioseq()
            seq.read(faFileHandler)
            if seq.sequence is None:
                break
            self.add(seq)


    ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long)
    #
    # @param faFileHandler file handler of a fasta file
    #
    def write(self, faFileHandler):
        for bs in self.db:
            bs.writeABioseqInAFastaFile(faFileHandler)


    ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long)
    #
    # @param outFaFileName file name of fasta file
    # @param mode mode given to open(): 'w' (default) to overwrite, 'a' to append
    #
    def save(self, outFaFileName, mode="w"):
        outFaFile = open(outFaFileName, mode)
        try:
            self.write(outFaFile)
        finally:
            outFaFile.close()


    ## Read a formatted fasta file and load it in the BioseqDB instance
    #
    # @param inFaFileName file name of fasta file
    #
    def load(self, inFaFileName):
        inFaFile = open(inFaFileName)
        try:
            self.read(inFaFile)
        finally:
            inFaFile.close()


    ## Reverse each sequence of the collection
    #
    def reverse(self):
        for bs in self.db:
            bs.reverse()


    ## Turn each sequence into its complement
    #
    def complement(self):
        for bs in self.db:
            bs.complement()


    ## Reverse and complement each sequence
    #
    def reverseComplement(self):
        for bs in self.db:
            bs.reverseComplement()


    ## Add every Bioseq of a list to the collection
    #
    # @param lBioseqs a list of Bioseq instances
    #
    def setData(self, lBioseqs):
        for bs in lBioseqs:
            self.add(bs)


    ## Initialization of each attribute of the collection
    #
    def reset(self):
        self.db = []
        self.idx = {}
        # the renamed index must be emptied too, or it keeps pointing at dropped entries
        self.idx_renamed = {}
        self.name = None
        self.mean_seq_lgth = None
        self.stat.reset()


    ## Remove all the gap of the sequences of the collection
    #
    def cleanGap(self):
        for iBioSeq in self.db:
            iBioSeq.cleanGap()


    ## Record the position of a Bioseq in both header indexes
    #
    # @param bs a Bioseq instance
    # @param position integer index of bs in self.db
    #
    def _indexBioseq(self, bs, position):
        self.idx[bs.header] = position
        renamedHeader = bs.header.replace("::", "-").replace(":", "-").replace(",", "-").replace(" ", "_")
        self.idx_renamed[renamedHeader] = position


    ## Add a Bioseq instance and update the attributes
    #
    # @warning writes an error on stderr and exits the process with status 1
    #          when the header is already present
    #
    # @param bs a Bioseq instance
    #
    def add(self, bs):
        if bs.header in self.idx:
            sys.stderr.write("ERROR: two sequences with same header '%s'\n" % (bs.header))
            sys.exit(1)
        self.db.append(bs)
        self._indexBioseq(bs, len(self.db) - 1)


    ## Give the Bioseq instance corresponding to specified index
    #
    # @note None is returned (no IndexError) when index >= size; negative
    #       indexes count from the end as with a list
    #
    # @return a Bioseq instance
    #
    def __getitem__(self, index):
        if index < len(self.db):
            return self.db[index]


    ## Give the number of sequences in the bank
    #
    # @return an integer
    #
    def getSize(self):
        return len(self.db)


    ## Give the cumulative sequence length in the bank
    #
    # @return an integer
    #
    def getLength(self):
        cumLength = 0
        for iBioseq in self.db:
            cumLength += iBioseq.getLength()
        return cumLength


    ## Return the length of a given sequence via its header
    #
    # @param header string
    # @return an integer
    #
    def getSeqLength(self, header):
        return self.fetch(header).getLength()


    ## Return a list with the sequence headers
    #
    def getHeaderList(self):
        return [bs.header for bs in self.db]


    ## Return a list with the sequences
    #
    def getSequencesList(self):
        return [bs.getSequence() for bs in self.db]


    ## Give the Bioseq instance of the BioseqDB specified by its header
    #
    # @warning name of this method not appropriate getBioseqByHeader is proposed
    # @param header string
    # @return a Bioseq instance
    #
    def fetch(self, header):
        return self.db[self.idx[header]]


    ## Give the Bioseq instance of the BioseqDB specified by its renamed header
    # In renamed header "::", ":", "," characters have been replaced by "-" and " " by "_"
    #
    # @param renamedHeader string
    # @return a Bioseq instance
    #
    def getBioseqByRenamedHeader(self, renamedHeader):
        return self.db[self.idx_renamed[renamedHeader]]


    ## Count the number of times the given nucleotide is present in the bank.
    #
    # @param nt character (nt or aa)
    # @return an integer
    #
    def countNt(self, nt):
        total = 0
        for iBioseq in self.db:
            total += iBioseq.countNt(nt)
        return total


    ## Count the number of times each nucleotide (A,T,G,C,N) is present in the bank.
    #
    # @return a dictionary with nucleotide as key and an integer as values
    #
    def countAllNt(self):
        dNt2Count = {}
        for nt in ["A", "T", "G", "C", "N"]:
            dNt2Count[nt] = self.countNt(nt)
        return dNt2Count


    ## Extract a sub BioseqDB of specified size which begins at specified start
    #
    # @param start integer index of first included Bioseq
    # @param size integer size of expected BioseqDB
    # @return a BioseqDB
    #
    def extractPart(self, start, size):
        iShorterBioseqDB = BioseqDB()
        for iBioseq in self.db[start:(start + size)]:
            iShorterBioseqDB.add(iBioseq)
        return iShorterBioseqDB


    ## Extract a sub BioseqDB with the specified number of best length Bioseq
    #
    # The length threshold is the numBioseq-th greatest length of the bank (the
    # smallest length when numBioseq is not lower than the bank size); Bioseq
    # reaching it are taken in bank order until numBioseq of them are kept.
    # A Bioseq whose sequence is None counts for a length of 0.
    #
    # @param numBioseq integer the number of Bioseq searched
    # @return a BioseqDB named like this one (empty when the bank is empty or
    #         numBioseq is not positive)
    #
    def bestLength(self, numBioseq):
        bestSeqs = BioseqDB()
        bestSeqs.setName(self.name)
        if numBioseq <= 0 or not self.db:
            # nothing to select: avoids indexing past the sorted list of lengths
            return bestSeqs

        lLengths = []
        for bs in self.db:
            if bs.sequence is None:
                lLengths.append(0)
            else:
                lLengths.append(bs.getLength())

        lSortedLengths = sorted(lLengths)
        size = len(lSortedLengths)
        if numBioseq < size:
            minLength = lSortedLengths[size - numBioseq]
        else:
            minLength = lSortedLengths[0]

        for bs, length in zip(self.db, lLengths):
            if length >= minLength:
                bestSeqs.add(bs)
                if bestSeqs.getSize() == numBioseq:
                    break
        return bestSeqs


    ## Add to the instance the Bioseq of a file whose header contains the specified pattern
    #
    # Nothing is done when the pattern is empty.
    #
    # @param pattern regular expression of wished Bioseq header
    # @param inFileName name of fasta file in which we want extract the BioseqDB
    #
    def extractPatternOfFile(self, pattern, inFileName):
        if pattern == "":
            return
        srch = re.compile(pattern)
        inFile = open(inFileName)
        try:
            while True:
                seq = Bioseq()
                seq.read(inFile)
                if seq.sequence is None:
                    break
                if srch.search(seq.header):
                    self.add(seq)
        finally:
            inFile.close()


    ## Extract a sub BioseqDB from the instance with all Bioseq header containing the specified pattern
    #
    # @param pattern regular expression of wished Bioseq header
    #
    # @return a BioseqDB, or None when the pattern is empty
    #
    def getByPattern(self, pattern):
        if pattern == "":
            return
        iBioseqDB = BioseqDB()
        srch = re.compile(pattern)
        for iBioseq in self.db:
            if srch.search(iBioseq.header):
                iBioseqDB.add(iBioseq)
        return iBioseqDB


    ## Extract a sub BioseqDB from the instance with all Bioseq header not containing the specified pattern
    #
    # @param pattern regular expression of not wished Bioseq header
    #
    # @return a BioseqDB, or None when the pattern is empty
    #
    def getDiffFromPattern(self, pattern):
        if pattern == "":
            return
        iBioseqDB = BioseqDB()
        srch = re.compile(pattern)
        for iBioseq in self.db:
            if not srch.search(iBioseq.header):
                iBioseqDB.add(iBioseq)
        return iBioseqDB


    ## Remove from the instance all Bioseq which header contains the specified pattern
    #
    # All matching Bioseq are removed in a single call and both header indexes
    # are rebuilt so that fetch() keeps returning the right instance.
    # Nothing is done when the pattern is empty.
    #
    # @param pattern regular expression of not wished Bioseq header
    #
    def rmByPattern(self, pattern):
        if pattern == "":
            return
        srch = re.compile(pattern)
        # build a new list: removing from self.db while iterating it skips elements
        lKept = [bs for bs in self.db if not srch.search(bs.header)]
        if len(lKept) == len(self.db):
            return
        self.db = lKept
        self.idx = {}
        self.idx_renamed = {}
        for position, bs in enumerate(self.db):
            self._indexBioseq(bs, position)


    ## Copy a part from another BioseqDB in the BioseqDB if Bioseq have got header containing the specified pattern
    #
    # @warning this method is called extractPattern in pyRepet.seq.BioseqDB
    #
    # @param pattern regular expression of wished Bioseq header
    # @param sourceBioseqDB the BioseqDB from which we want extract Bioseq
    #
    def addBioseqFromABioseqDBIfHeaderContainPattern(self, pattern, sourceBioseqDB):
        if pattern == "":
            return
        srch = re.compile(pattern)
        for seq in sourceBioseqDB.db:
            if srch.search(seq.header):
                self.add(seq)


    ## Up-case the sequence characters in all sequences
    #
    def upCase(self):
        for bs in self.db:
            bs.upCase()
+ + + ## Split each gapped Bioseq in a list and store all in a dictionary + # + # @return a dict, keys are bioseq headers, values are list of Map instances + # + def getDictOfLMapsWithoutGaps( self ): + dSeq2Maps = {} + + for bs in self.db: + dSeq2Maps[ bs.header ] = bs.getLMapWhithoutGap() + + return dSeq2Maps + + ## Give the list of the sequence length in the bank + # + # @return an list + # + def getListOfSequencesLength( self ): + lLength = [] + for iBioseq in self.db: + lLength.append(iBioseq.getLength()) + + return lLength + + ## Return sequence length for a list of sequence header + # + def getSeqLengthByListOfName( self, lHeaderName ): + lseqLength=[] + for headerName in lHeaderName: + lseqLength.append(self.getSeqLength( headerName )) + return lseqLength diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/seq/BioseqUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/BioseqUtils.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,296 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import math
import re
from commons.core.seq.Bioseq import Bioseq

## Static methods for sequences manipulation
#
class BioseqUtils(object):

    # Codons that must be matched on their three nucleotides
    _CODON_TO_AA = {
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
        "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
        "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
        "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
        "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
        "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
        "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
        "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    }

    # Codons decided by their first two nucleotides, whatever the third symbol is
    _CODON_PREFIX_TO_AA = {
        "CT": "L", "GT": "V", "TC": "S", "CC": "P",
        "AC": "T", "GC": "A", "CG": "R", "GG": "G",
    }

    # IUPAC ambiguity codes (N, R, Y, M, K, W, S, B, D, H, V)
    _AMBIGUOUS_NT = re.compile("N|R|Y|M|K|W|S|B|D|H|V")

    ## Translate a nucleotide sequence
    #
    # The instance is modified in place: its sequence is up-cased and then
    # replaced by the peptide. Stop codons give "*". A codon that matches no
    # entry but holds an IUPAC ambiguity code gives "X"; any other unmatched
    # codon gives nothing. An incomplete codon at the end is ignored.
    #
    # @param bioSeqInstanceToTranslate a bioseq instance to translate
    # @param phase a integer : 1 (default), 2 or 3
    #
    @staticmethod
    def translateSequence(bioSeqInstanceToTranslate, phase=1):
        # We need capital letters !
        bioSeqInstanceToTranslate.upCase()
        sequence = bioSeqInstanceToTranslate.sequence
        # floor division keeps the result an integer under Python 2 and Python 3
        length = ((len(sequence) - (phase - 1)) // 3) * 3
        lAminoAcids = []
        for i in range(phase - 1, length, 3):
            codon = sequence[i:i + 3]
            if codon in BioseqUtils._CODON_TO_AA:
                lAminoAcids.append(BioseqUtils._CODON_TO_AA[codon])
            elif codon[:2] in BioseqUtils._CODON_PREFIX_TO_AA:
                lAminoAcids.append(BioseqUtils._CODON_PREFIX_TO_AA[codon[:2]])
            elif BioseqUtils._AMBIGUOUS_NT.search(codon):
                # We don't know the amino acid because we don't have the nucleotide
                lAminoAcids.append("X")
        bioSeqInstanceToTranslate.sequence = "".join(lAminoAcids)

    ## Add the frame info in header
    #
    # "_<phase>" is appended to the first word of the header.
    #
    # @param bioSeqInstance a bioseq instance to translate
    # @param phase a integer : 1 , 2 or 3
    #
    @staticmethod
    def setFrameInfoOnHeader(bioSeqInstance, phase):
        if " " in bioSeqInstance.header:
            name, desc = bioSeqInstance.header.split(" ", 1)
            bioSeqInstance.header = name + "_" + str(phase) + " " + desc
        else:
            bioSeqInstance.header = bioSeqInstance.header + "_" + str(phase)

    ## Translate a nucleotide sequence for all frames (positives and negatives)
    #
    # @param bioSeqInstanceToTranslate a bioseq instance to translate
    # @return a list of six translated copies, frames 1 to 3 then 4 to 6
    #
    @staticmethod
    def translateInAllFrame(bioSeqInstanceToTranslate):
        positives = BioseqUtils._translateInPositiveFrames(bioSeqInstanceToTranslate)
        negatives = BioseqUtils._translateInNegativeFrames(bioSeqInstanceToTranslate)
        return positives + negatives

    ## Replace the stop codons by X in sequence
    #
    # @param bioSeqInstance a bioseq instance
    #
    @staticmethod
    def replaceStopCodonsByX(bioSeqInstance):
        bioSeqInstance.sequence = bioSeqInstance.sequence.replace("*", "X")

    ## Translate in a list all the frames of all the bioseq of bioseq list
    #
    # @param bioseqList a list of bioseq instances
    # @return a list of translated bioseq instances
    #
    @staticmethod
    def translateBioseqListInAllFrames(bioseqList):
        bioseqListInAllFrames = []
        for bioseq in bioseqList:
            bioseqListInAllFrames.extend(BioseqUtils.translateInAllFrame(bioseq))
        return bioseqListInAllFrames

    ## Replace the stop codons by X for each sequence of a bioseq list
    #
    # The instances are modified in place; the returned list holds the same
    # instances.
    #
    # @param lBioseqWithStops a list of bioseq instances
    # @return a list of bioseq instances
    #
    @staticmethod
    def replaceStopCodonsByXInBioseqList(lBioseqWithStops):
        bioseqListWithStopsreplaced = []
        for bioseq in lBioseqWithStops:
            BioseqUtils.replaceStopCodonsByX(bioseq)
            bioseqListWithStopsreplaced.append(bioseq)
        return bioseqListWithStopsreplaced

    ## Write a list of bioseq instances in a fasta file (60 characters per line)
    #
    # @param lBioseq a list of bioseq instances
    # @param fileName string
    #
    @staticmethod
    def writeBioseqListIntoFastaFile(lBioseq, fileName):
        fout = open(fileName, "w")
        try:
            for bioseq in lBioseq:
                bioseq.write(fout)
        finally:
            fout.close()

    ## read in a fasta file and create a list of bioseq instances
    #
    # Reading stops at the first Bioseq whose header is None.
    #
    # @param fileName string
    # @return a list of bioseq
    #
    @staticmethod
    def extractBioseqListFromFastaFile(fileName):
        lBioseq = []
        faFile = open(fileName)
        try:
            while True:
                bioseq = Bioseq()
                bioseq.read(faFile)
                if bioseq.header is None:
                    break
                lBioseq.append(bioseq)
        finally:
            # the handle was never closed before
            faFile.close()
        return lBioseq

    ## Give the length of a sequence search by name
    #
    # @param lBioseq a list of bioseq instances
    # @param seqName string
    # @return an integer (0 when no header equals seqName)
    #
    @staticmethod
    def getSeqLengthWithSeqName(lBioseq, seqName):
        length = 0
        for bioseq in lBioseq:
            if bioseq.header == seqName:
                length = bioseq.getLength()
                break
        return length

    ## Translate copies of a bioseq in frames 1, 2 and 3
    #
    # @param bioSeqInstanceToTranslate a bioseq instance (left untouched)
    # @return a list of three translated copies
    #
    @staticmethod
    def _translateInPositiveFrames(bioSeqInstanceToTranslate):
        lFrames = []
        for phase in (1, 2, 3):
            iCopy = bioSeqInstanceToTranslate.copyBioseqInstance()
            BioseqUtils.setFrameInfoOnHeader(iCopy, phase)
            BioseqUtils.translateSequence(iCopy, phase)
            lFrames.append(iCopy)
        return lFrames

    ## Translate reverse-complemented copies of a bioseq; frames are tagged 4, 5 and 6
    #
    # @param bioSeqInstanceToTranslate a bioseq instance (left untouched)
    # @return a list of three translated copies
    #
    @staticmethod
    def _translateInNegativeFrames(bioSeqInstanceToTranslate):
        lFrames = []
        for phase in (1, 2, 3):
            iCopy = bioSeqInstanceToTranslate.copyBioseqInstance()
            iCopy.reverseComplement()
            # frames 4, 5, 6 are phases 1, 2, 3 on the reverse strand
            BioseqUtils.setFrameInfoOnHeader(iCopy, phase + 3)
            BioseqUtils.translateSequence(iCopy, phase)
            lFrames.append(iCopy)
        return lFrames


    ## Return a dictionary which keys are sequence headers and values sequence lengths.
+ # + def getLengthPerSeqFromFile( inFile ): + dHeader2Length = {} + inFileHandler = open( inFile, "r" ) + while True: + iBs = Bioseq() + iBs.read( inFileHandler ) + if iBs.sequence == None: + break + dHeader2Length[ iBs.header ] = iBs.getLength() + inFileHandler.close() + return dHeader2Length + + getLengthPerSeqFromFile = staticmethod( getLengthPerSeqFromFile ) + + + ## Return the list of Bioseq instances, these being sorted in decreasing length + # + def getBioseqListSortedByDecreasingLength( lBioseqs ): + return sorted( lBioseqs, key=lambda iBs: ( iBs.getLength() ), reverse=True ) + + getBioseqListSortedByDecreasingLength = staticmethod( getBioseqListSortedByDecreasingLength ) + + + ## Return the list of Bioseq instances, these being sorted in decreasing length (without gaps) + # + def getBioseqListSortedByDecreasingLengthWithoutGaps( lBioseqs ): + return sorted( lBioseqs, key=lambda iBs: ( len(iBs.sequence.replace("-","")) ), reverse=True ) + + getBioseqListSortedByDecreasingLengthWithoutGaps = staticmethod( getBioseqListSortedByDecreasingLengthWithoutGaps ) diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/seq/ClusterConsensusCollection.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/ClusterConsensusCollection.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,66 @@ +import re +from commons.core.seq.BioseqDB import BioseqDB + +## Record a collection of bioseqDB representing cluster consensus +# +class ClusterConsensusCollection(object): + + ## constructor + # + # @param clusterFileName string name of file containing the cluster of consensus + # + def __init__(self, clusterFileName): + self._clusterFileName = clusterFileName + self._lClusterConsensus = [] + + def __eq__(self, o): + return self._clusterFileName == o._clusterFileName and self._lClusterConsensus == o._lClusterConsensus + + def getLClusterConsensus(self): + return self._lClusterConsensus + + def fillCollection(self): + iBioseqDBAllCluster = BioseqDB() + fClusterFile = 
open(self._clusterFileName, "r") + iBioseqDBAllCluster.read(fClusterFile) + fClusterFile.close() + lHeader = iBioseqDBAllCluster.getHeaderList() + firstHeader = lHeader[0] + previousClusterName, seqHeader = self._getClusterNameAndSeqHeader(firstHeader) + clusterConsensus = BioseqDB() + clusterConsensus.setName(previousClusterName) + self._addBioseqInClusterConsensus(iBioseqDBAllCluster, firstHeader, seqHeader, clusterConsensus) + for header in lHeader[1:]: + clusterName, seqHeader = self._getClusterNameAndSeqHeader(header) + if clusterName != previousClusterName: + self._lClusterConsensus.append(clusterConsensus) + previousClusterName = clusterName + clusterConsensus = BioseqDB() + clusterConsensus.setName(previousClusterName) + self._addBioseqInClusterConsensus(iBioseqDBAllCluster, header, seqHeader, clusterConsensus) + self._lClusterConsensus.append(clusterConsensus) + + def _getClusterNameAndSeqHeader(self, header): + m = re.match("(\D*)(\d+)Mb\d+\s.*", header) + clusterNumber = m.group(2) + clusterName = m.group(1) + clusterNumber + lPartsHeaderheader = header.split(" ") + seqHeader = lPartsHeaderheader[1] + return clusterName, seqHeader + + def _addBioseqInClusterConsensus(self, iBioseqDBAllCluster, firstHeader, seqHeader, clusterConsensus): + ibioseq = iBioseqDBAllCluster.fetch(firstHeader) + ibioseq.setHeader(seqHeader) + clusterConsensus.add(ibioseq) + + def getNumClusterForAConsensus(self, seqName): + nbCluster = 1 + for bioseqDB in self._lClusterConsensus: + if seqName in bioseqDB.getHeaderList(): + return nbCluster + nbCluster += 1 + + def getNumConsensusInCluster(self, numCluster): + return self._lClusterConsensus[numCluster - 1].getSize() + + diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/seq/FastaUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/FastaUtils.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,1197 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# 
http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +import os +import sys +import string +import math +import shutil +import re +import glob +from operator import itemgetter +from commons.core.seq.BioseqDB import BioseqDB +from commons.core.seq.Bioseq import Bioseq +from commons.core.coord.MapUtils import MapUtils +from commons.core.coord.Range import Range +from commons.core.checker.CheckerUtils import CheckerUtils +from commons.core.launcher.LauncherUtils import LauncherUtils +from commons.core.coord.ConvCoord import ConvCoord +from commons.core.parsing.FastaParser import FastaParser + + +## Static methods for fasta file manipulation +# +class FastaUtils( object ): + + ## Count the number of sequences in the input fasta file + # + # @param inFile name of the input fasta file + # + # @return integer number of sequences in the input fasta file + # + @staticmethod + def dbSize( inFile ): + nbSeq = 0 + inFileHandler = open( inFile, "r" ) + line = inFileHandler.readline() + while line: + if line[0] == ">": + nbSeq = nbSeq + 1 + line = inFileHandler.readline() + inFileHandler.close() + + return nbSeq + + + ## Compute the cumulative sequence length in the input fasta file + # + # @param inFile handler of the input fasta file + # + @staticmethod + def dbCumLength( inFile ): + cumLength = 0 + line = inFile.readline() + while line: + if line[0] != ">": + cumLength += len(string.rstrip(line)) + line = inFile.readline() + + return cumLength + + + ## Return a list with the length of each sequence in the input fasta file + # + # @param inFile string name of the input fasta file + # + @staticmethod + def dbLengths( inFile ): + lLengths = [] + inFileHandler = open( inFile, "r" ) + currentLength = 0 + line = inFileHandler.readline() + while line: + if line[0] == ">": + if currentLength != 0: + lLengths.append( currentLength ) + currentLength = 0 + else: + currentLength += len(line[:-1]) + line = inFileHandler.readline() + lLengths.append( currentLength ) + inFileHandler.close() + return lLengths + + + ## Retrieve the sequence 
headers present in the input fasta file + # + # @param inFile string name of the input fasta file + # @param verbose integer level of verbosity + # + # @return list of sequence headers + # + @staticmethod + def dbHeaders( inFile, verbose=0 ): + lHeaders = [] + + inFileHandler = open( inFile, "r" ) + line = inFileHandler.readline() + while line: + if line[0] == ">": + lHeaders.append( string.rstrip(line[1:]) ) + if verbose > 0: + print string.rstrip(line[1:]) + line = inFileHandler.readline() + inFileHandler.close() + + return lHeaders + + + ## Cut a data bank into chunks according to the input parameters + # If a sequence is shorter than the threshold, it is only renamed (not cut) + # + # @param inFileName string name of the input fasta file + # @param chkLgth string chunk length (in bp, default=200000) + # @param chkOver string chunk overlap (in bp, default=10000) + # @param wordN string N stretch word length (default=11, 0 for no detection) + # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map') + # @param clean boolean remove 'cut' and 'Nstretch' files + # @param verbose integer (default = 0) + # + @staticmethod + def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ): + nbSeq = FastaUtils.dbSize( inFileName ) + if verbose > 0: + print "cut the %i input sequences with cutterDB..." % ( nbSeq ) + sys.stdout.flush() + + prg = "cutterDB" + cmd = prg + cmd += " -l %s" % ( chkLgth ) + cmd += " -o %s" %( chkOver ) + cmd += " -w %s" % ( wordN ) + cmd += " %s" % ( inFileName ) + returnStatus = os.system( cmd ) + if returnStatus != 0: + msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus ) + sys.stderr.write( "%s\n" % ( msg ) ) + sys.exit(1) + + nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) ) + if verbose > 0: + print "done (%i chunks)" % ( nbChunks ) + sys.stdout.flush() + + if verbose > 0: + print "rename the headers..." 
+ sys.stdout.flush() + + if outFilePrefix == "": + outFastaName = inFileName + "_chunks.fa" + outMapName = inFileName + "_chunks.map" + else: + outFastaName = outFilePrefix + ".fa" + outMapName = outFilePrefix + ".map" + + inFile = open( "%s_cut" % ( inFileName ), "r" ) + line = inFile.readline() + + outFasta = open( outFastaName, "w" ) + outMap = open( outMapName, "w" ) + + # read line after line (no need for big RAM) and change the sequence headers + while line: + + if line[0] == ">": + if verbose > 1: + print "rename '%s'" % ( line[:-1] ); sys.stdout.flush() + data = line[:-1].split(" ") + seqID = data[0].split(">")[1] + newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) ) + oldHeader = data[2] + seqStart = data[4].split("..")[0] + seqEnd = data[4].split("..")[1] + outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) ) + outFasta.write( ">%s\n" % ( newHeader ) ) + + else: + outFasta.write( line.upper() ) + + line = inFile.readline() + + inFile.close() + outFasta.close() + outMap.close() + + if clean == True: + os.remove(inFileName + "_cut") + os.remove(inFileName + ".Nstretch.map") + + + ## Split the input fasta file in several output files + # + # @param inFile string name of the input fasta file + # @param nbSeqPerBatch integer number of sequences per output file + # @param newDir boolean put the sequences in a new directory called 'batches' + # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1') + # @param prefix prefix in output file name + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ): + if not os.path.exists( inFile ): + msg = "ERROR: file '%s' doesn't exist" % ( inFile ) + sys.stderr.write( "%s\n" % ( msg ) ) + sys.exit(1) + + nbSeq = FastaUtils.dbSize( inFile ) + + nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) ) + if verbose > 0: + print "save the %i input 
sequences into %i batches" % ( nbSeq, nbBatches ) + sys.stdout.flush() + + if nbSeqPerBatch > 1 and useSeqHeader: + useSeqHeader = False + + if newDir == True: + if os.path.exists( "batches" ): + shutil.rmtree( "batches" ) + os.mkdir( "batches" ) + os.chdir( "batches" ) + os.system( "ln -s ../%s ." % ( inFile ) ) + + inFileHandler = open( inFile, "r" ) + inFileHandler.seek( 0, 0 ) + countBatch = 0 + countSeq = 0 + line = inFileHandler.readline() + while line: + if line == "": + break + if line[0] == ">": + countSeq += 1 + if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1: + if "outFile" in locals(): + outFile.close() + countBatch += 1 + if nbSeqPerBatch == 1 and useSeqHeader: + outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") ) + else: + outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) ) + outFile = open( outFileName, "w" ) + if verbose > 1: + print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName ) + sys.stdout.flush() + outFile.write( line ) + line = inFileHandler.readline() + inFileHandler.close() + + if newDir == True: + os.remove( os.path.basename( inFile ) ) + os.chdir( ".." 
) + + + ## Split the input fasta file in several output files + # + # @param inFileName string name of the input fasta file + # @param maxSize integer max cumulative length for each output file + # + @staticmethod + def splitFastaFileInBatches(inFileName, maxSize = 200000): + iBioseqDB = BioseqDB(inFileName) + lHeadersSizeTuples = [] + for iBioseq in iBioseqDB.db: + lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength())) + + lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize) + os.mkdir("batches") + os.chdir("batches") + + iterator = 0 + for lHeader in lHeadersList : + iterator += 1 + with open("batch_%s.fa" % iterator, 'w') as f : + for header in lHeader : + iBioseq = iBioseqDB.fetch(header) + iBioseq.write(f) + os.chdir("..") + + + ## Split the input fasta file in several output files according to their cluster identifier + # + # @param inFileName string name of the input fasta file + # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust) + # @param simplifyHeader boolean simplify the headers + # @param createDir boolean put the sequences in different directories + # @param outPrefix string prefix of the output files (default='seqCluster') + # @param verbose integer (default = 0) + # + @staticmethod + def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ): + if not os.path.exists( inFileName ): + print "ERROR: %s doesn't exist" % ( inFileName ) + sys.exit(1) + + inFile = open( inFileName, "r" ) + + line = inFile.readline() + if line: + name = line.split(" ")[0] + if "Cluster" in name: + clusterID = name.split("Cluster")[1].split("Mb")[0] + seqID = name.split("Mb")[1] + else: + clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust + if "Q" in name.split("Gr")[0]: + seqID = name.split("Gr")[0].split("MbQ")[1] + elif "S" in name: + 
seqID = name.split("Gr")[0].split("MbS")[1] + sClusterIDs = set( [ clusterID ] ) + if simplifyHeader == True: + header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) + else: + header = line[1:-1] + if createDir == True: + if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): + os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) + os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) + outFileName = "%s%s.fa" % ( outPrefix, clusterID ) + outFile = open( outFileName, "w" ) + outFile.write( ">%s\n" % ( header ) ) + prevClusterID = clusterID + + line = inFile.readline() + while line: + if line[0] == ">": + name = line.split(" ")[0] + if "Cluster" in name: + clusterID = name.split("Cluster")[1].split("Mb")[0] + seqID = name.split("Mb")[1] + else: + clusterID = name.split("Cl")[0].split("Gr")[1] + if "Q" in name.split("Gr")[0]: + seqID = name.split("Gr")[0].split("MbQ")[1] + elif "S" in name: + seqID = name.split("Gr")[0].split("MbS")[1] + + if clusterID != prevClusterID: + outFile.close() + + if simplifyHeader == True: + header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) + else: + header = line[1:-1] + + if createDir == True: + os.chdir( ".." 
) + if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): + os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) + os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) + + outFileName = "%s%s.fa" % ( outPrefix, clusterID ) + if not os.path.exists( outFileName ): + outFile = open( outFileName, "w" ) + else: + if clusterID != prevClusterID: + outFile.close() + outFile = open( outFileName, "a" ) + outFile.write( ">%s\n" % ( header ) ) + prevClusterID = clusterID + sClusterIDs.add( clusterID ) + + else: + outFile.write( line ) + + line = inFile.readline() + + outFile.close() + if verbose > 0: + print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush() + + if createDir == True: + os.chdir("..") + else: + print "WARNING: empty input file - no cluster found"; sys.stdout.flush() + + + ## Filter a fasta file in two fasta files using the length of each sequence as a criteron + # + # @param len_min integer length sequence criterion to filter + # @param inFileName string name of the input fasta file + # @param verbose integer (default = 0) + # + @staticmethod + def dbLengthFilter( len_min, inFileName, verbose=0 ): + file_db = open( inFileName, "r" ) + file_dbInf = open( inFileName+".Inf"+str(len_min), "w" ) + file_dbSup = open( inFileName+".Sup"+str(len_min), "w" ) + seq = Bioseq() + numseq = 0 + nbsave = 0 + + seq.read( file_db ) + while seq.sequence: + l = seq.getLength() + numseq = numseq + 1 + if l >= len_min: + seq.write( file_dbSup ) + if verbose > 0: + print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!' + nbsave=nbsave+1 + else: + seq.write( file_dbInf ) + if verbose > 0: + print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!' 
+ nbsave=nbsave+1 + seq.read( file_db ) + + file_db.close() + file_dbInf.close() + file_dbSup.close() + if verbose > 0: + print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min) + + + ## Extract the longest sequences from a fasta file + # + # @param num integer maximum number of sequences in the output file + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output fasta file + # @param minThresh integer minimum length threshold (default=0) + # @param verbose integer (default = 0) + # + @staticmethod + def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ): + bsDB = BioseqDB( inFileName ) + if verbose > 0: + print "nb of input sequences: %i" % ( bsDB.getSize() ) + + if outFileName == "": + outFileName = inFileName + ".best" + str(num) + outFile = open( outFileName, "w" ) + + if bsDB.getSize()==0: + return 0 + + num = int(num) + if verbose > 0: + print "keep the %i longest sequences" % ( num ) + if minThresh > 0: + print "with length > %i bp" % ( minThresh ) + sys.stdout.flush() + + # retrieve the length of each input sequence + tmpLSeqLgth = [] + seqNum = 0 + for bs in bsDB.db: + seqNum += 1 + tmpLSeqLgth.append( bs.getLength() ) + if verbose > 1: + print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() ) + sys.stdout.flush() + + # sort the lengths + tmpLSeqLgth.sort() + tmpLSeqLgth.reverse() + + # select the longest + lSeqLgth = [] + for i in xrange( 0, min(num,len(tmpLSeqLgth)) ): + if tmpLSeqLgth[i] >= minThresh: + lSeqLgth.append( tmpLSeqLgth[i] ) + if verbose > 0: + print "selected max length: %i" % ( max(lSeqLgth) ) + print "selected min length: %i" % ( min(lSeqLgth) ) + sys.stdout.flush() + + # save the longest + inFile = open( inFileName ) + seqNum = 0 + nbSave = 0 + for bs in bsDB.db: + seqNum += 1 + if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh: + bs.write( outFile ) + if verbose > 1: + print "%d 
seq %s : saved !" % ( seqNum, bs.header[0:40] ) + sys.stdout.flush() + nbSave += 1 + if nbSave == num: + break + inFile.close() + outFile.close() + if verbose > 0: + print nbSave, "saved sequences in ", outFileName + sys.stdout.flush() + + return 0 + + + ## Extract all the sequence headers from a fasta file and write them in a new fasta file + # + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers') + # + @staticmethod + def dbExtractSeqHeaders( inFileName, outFileName="" ): + lHeaders = FastaUtils.dbHeaders( inFileName ) + + if outFileName == "": + outFileName = inFileName + ".headers" + + outFile = open( outFileName, "w" ) + for i in lHeaders: + outFile.write( i + "\n" ) + outFile.close() + + return 0 + + + ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file + # + # @param pattern regular expression to search in headers + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ): + if pattern == "": + return + + if outFileName == "": + outFileName = inFileName + '.extracted' + outFile = open( outFileName, 'w' ) + + patternTosearch = re.compile( pattern ) + bioseq = Bioseq() + bioseqNb = 0 + savedBioseqNb = 0 + inFile = open( inFileName, "r" ) + bioseq.read( inFile ) + while bioseq.sequence: + bioseqNb = bioseqNb + 1 + m = patternTosearch.search( bioseq.header ) + if m: + bioseq.write( outFile ) + if verbose > 1: + print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!' 
+ savedBioseqNb = savedBioseqNb + 1 + bioseq.read( inFile ) + inFile.close() + + outFile.close() + + if verbose > 0: + print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) + + + ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file + # + # @param patternFileName string file containing regular expression to search in headers + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): + + if patternFileName == "": + print "ERROR: no file of pattern" + sys.exit(1) + + bioseq = Bioseq() + bioseqNb = 0 + savedBioseqNb = 0 + lHeaders = [] + + inFile = open( inFileName, "r" ) + bioseq.read( inFile ) + while bioseq.sequence != None: + lHeaders.append( bioseq.header ) + bioseq.read( inFile ) + inFile.close() + + lHeadersToKeep = [] + patternFile = open( patternFileName, "r" ) + for pattern in patternFile: + if verbose > 0: + print "pattern: ",pattern[:-1]; sys.stdout.flush() + + patternToSearch = re.compile(pattern[:-1]) + for h in lHeaders: + if patternToSearch.search(h): + lHeadersToKeep.append(h) + patternFile.close() + + if outFileName == "": + outFileName = inFileName + ".extracted" + outFile=open( outFileName, "w" ) + + inFile = open( inFileName, "r" ) + bioseq.read(inFile) + while bioseq.sequence: + bioseqNb += 1 + if bioseq.header in lHeadersToKeep: + bioseq.write(outFile) + if verbose > 1: + print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] 
saved !!'; sys.stdout.flush() + savedBioseqNb += 1 + bioseq.read(inFile) + inFile.close() + + outFile.close() + + if verbose > 0: + print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) + + + ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file + # + # @param pattern regular expression to search in headers + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ): + if pattern == "": + return + + patternToSearch = re.compile(pattern) + + if outFileName == "": + outFileName = inFileName + '.cleaned' + outFile = open(outFileName,'w') + + bioseq = Bioseq() + bioseqNb = 0 + savedBioseqNb = 0 + inFile = open(inFileName) + bioseq.read(inFile) + while bioseq.sequence != None: + bioseqNb += 1 + if not patternToSearch.search(bioseq.header): + bioseq.write(outFile) + if verbose > 1: + print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!' 
+ savedBioseqNb += 1 + bioseq.read(inFile) + inFile.close() + + outFile.close() + + if verbose > 0: + print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) + + + ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file + # + # @param patternFileName string file containing regular expression to search in headers + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): + if patternFileName == "": + print "ERROR: no file of pattern" + sys.exit(1) + + bioseq = Bioseq() + bioseqNb = 0 + savedBioseqNb = 0 + lHeaders = [] + inFile = open( inFileName, "r" ) + bioseq.read( inFile ) + while bioseq.sequence != None: + bioseqNb += 1 + lHeaders.append( bioseq.header ) + bioseq.read( inFile ) + inFile.close() + + patternFile = open( patternFileName, "r") + lHeadersToRemove = [] + for pattern in patternFile: + if verbose > 0: + print "pattern: ",pattern[:-1]; sys.stdout.flush() + + patternToSearch = re.compile( pattern[:-1] ) + for h in lHeaders: + if patternToSearch.search(h): + lHeadersToRemove.append(h) + patternFile.close() + + if outFileName == "": + outFileName = inFileName + '.cleaned' + outFile = open( outFileName, 'w' ) + + bioseqNum = 0 + inFile=open( inFileName ) + bioseq.read( inFile ) + while bioseq.sequence != None: + bioseqNum += 1 + if bioseq.header not in lHeadersToRemove: + bioseq.write( outFile ) + if verbose > 1: + print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] 
saved !!'; sys.stdout.flush() + savedBioseqNb += 1 + bioseq.read( inFile ) + inFile.close() + + outFile.close() + + if verbose > 0: + print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) + + + ## Find sequence's ORFs from a fasta file and write them in a tab file + # + # @param inFileName string name of the input fasta file + # @param orfMaxNb integer Select orfMaxNb best ORFs + # @param orfMinLength integer Keep ORFs with length > orfMinLength + # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map') + # @param verbose integer verbosity level (default = 0) + # + @staticmethod + def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ): + if outFileName == "": + outFileName = inFileName + ".ORF.map" + outFile = open( outFileName, "w" ) + + bioseq = Bioseq() + bioseqNb = 0 + + inFile = open( inFileName ) + bioseq.read( inFile ) + while bioseq.sequence != None: + bioseq.upCase() + bioseqNb += 1 + if verbose > 0: + print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]' + + orf = bioseq.findORF() + bestOrf = [] + for i in orf.keys(): + orfLen = len(orf[i]) + for j in xrange(1, orfLen): + start = orf[i][j-1] + 4 + end = orf[i][j] + 3 + if end - start >= orfMinLength: + bestOrf.append( ( end-start, i+1, start, end ) ) + + bioseq.reverseComplement() + + orf = bioseq.findORF() + seqLen = bioseq.getLength() + for i in orf.keys(): + orfLen = len(orf[i]) + for j in xrange(1, orfLen): + start = seqLen - orf[i][j-1] - 3 + end = seqLen - orf[i][j] - 2 + if start - end >= orfMinLength: + bestOrf.append( ( start-end, (i+1)*-1, start, end ) ) + + bestOrf.sort() + bestOrf.reverse() + bestOrfNb = len(bestOrf) + if orfMaxNb != 0 and orfMaxNb < bestOrfNb: + bestOrfNb = orfMaxNb + for i in xrange(0, bestOrfNb): + if verbose > 0: + print bestOrf[i] + outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\ + "|"+str(bestOrf[i][0]),bioseq.header, + 
bestOrf[i][2],bestOrf[i][3])) + bioseq.read( inFile ) + + inFile.close() + outFile.close() + + return 0 + + + ## Sort sequences by increasing length (write a new file) + # + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output fasta file + # @param verbose integer verbosity level + # + @staticmethod + def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0): + if verbose > 0: + print "sort sequences by increasing length" + sys.stdout.flush() + if not os.path.exists( inFileName ): + print "ERROR: file '%s' doesn't exist" % ( inFileName ) + sys.exit(1) + + # read each seq one by one + # save them in distinct temporary files + # with their length in the name + inFileHandler = open( inFileName, "r" ) + countSeq = 0 + bs = Bioseq() + bs.read( inFileHandler ) + while bs.header: + countSeq += 1 + tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq ) + bs.appendBioseqInFile( tmpFile ) + if verbose > 1: + print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile ) + bs.header = "" + bs.sequence = "" + bs.read( inFileHandler ) + inFileHandler.close() + + # sort temporary file names + # concatenate them into the output file + if os.path.exists( outFileName ): + os.remove( outFileName ) + lFiles = glob.glob( "*bp_*nb" ) + lFiles.sort( key=lambda s:int(s.split("bp_")[0]) ) + for fileName in lFiles: + cmd = "cat %s >> %s" % ( fileName, outFileName ) + returnValue = os.system( cmd ) + if returnValue != 0: + print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName ) + sys.exit(1) + os.remove( fileName ) + + return 0 + + + ## Sort sequences by header + # + # @param inFileName string name of the input fasta file + # @param outFileName string name of the output fasta file + # @param verbose integer verbosity level + # + @staticmethod + def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0): + if outFileName == "": + outFileName = "%s_sortByHeaders.fa" % 
os.path.splitext(inFileName)[0] + iBioseqDB = BioseqDB(inFileName) + f = open(outFileName, "w") + lHeaders = sorted(iBioseqDB.getHeaderList()) + for header in lHeaders: + iBioseq = iBioseqDB.fetch(header) + iBioseq.write(f) + f.close() + + + ## Return a dictionary which keys are the headers and values the length of the sequences + # + # @param inFile string name of the input fasta file + # @param verbose integer verbosity level + # + @staticmethod + def getLengthPerHeader( inFile, verbose=0 ): + dHeader2Length = {} + + inFileHandler = open( inFile, "r" ) + currentSeqHeader = "" + currentSeqLength = 0 + line = inFileHandler.readline() + while line: + if line[0] == ">": + if currentSeqHeader != "": + dHeader2Length[ currentSeqHeader ] = currentSeqLength + currentSeqLength = 0 + currentSeqHeader = line[1:-1] + if verbose > 0: + print "current header: %s" % ( currentSeqHeader ) + sys.stdout.flush() + else: + currentSeqLength += len( line.replace("\n","") ) + line = inFileHandler.readline() + dHeader2Length[ currentSeqHeader ] = currentSeqLength + inFileHandler.close() + + return dHeader2Length + + + ## Convert headers from a fasta file having chunk coordinates + # + # @param inFile string name of the input fasta file + # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes + # @param outFile string name of the output file + # + @staticmethod + def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile): + inFileHandler = open(inFile, "r") + outFileHandler = open(outFile, "w") + dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile) + iConvCoord = ConvCoord() + line = inFileHandler.readline() + while line: + if line[0] == ">": + if "{Fragment}" in line: + chkName = line.split(" ")[1] + chrName = dChunk2Map[chkName].seqname + lCoordPairs = line.split(" ")[3].split(",") + lRangesOnChk = [] + for i in lCoordPairs: + iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1])) + lRangesOnChk.append(iRange) + 
lRangesOnChr = [] + for iRange in lRangesOnChk: + lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map)) + newHeader = line[1:-1].split(" ")[0] + newHeader += " %s" % chrName + newHeader += " {Fragment}" + newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end) + for iRange in lRangesOnChr[1:]: + newHeader += ",%i..%i" % (iRange.start, iRange.end) + outFileHandler.write(">%s\n" % newHeader) + else: + chkName = line.split("_")[1].split(" ")[0] + chrName = dChunk2Map[chkName].seqname + coords = line[line.find("[")+1 : line.find("]")] + start = int(coords.split(",")[0]) + end = int(coords.split(",")[1]) + iRangeOnChk = Range(chkName, start, end) + iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map) + newHeader = line[1:-1].split("_")[0] + newHeader += " %s" % chrName + newHeader += " %s" % line[line.find("(") : line.find(")")+1] + newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd()) + outFileHandler.write(">%s\n" % newHeader) + else: + outFileHandler.write(line) + line = inFileHandler.readline() + inFileHandler.close() + outFileHandler.close() + + + ## Convert a fasta file to a length file + # + # @param inFile string name of the input fasta file + # @param outFile string name of the output file + # + @staticmethod + def convertFastaToLength(inFile, outFile = ""): + if outFile == "": + outFile = "%s.length" % inFile + + if inFile != "": + with open(inFile, "r") as inFH: + with open(outFile, "w") as outFH: + bioseq = Bioseq() + bioseq.read(inFH) + while bioseq.sequence != None: + seqLen = bioseq.getLength() + outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen)) + bioseq.read(inFH) + + + ## Convert a fasta file to a seq file + # + # @param inFile string name of the input fasta file + # @param outFile string name of the output file + # + @staticmethod + def convertFastaToSeq(inFile, outFile = ""): + if outFile == "": + outFile = "%s.seq" % inFile + + if inFile != "": + with open(inFile, "r") as 
inFH: + with open(outFile, "w") as outFH: + bioseq = Bioseq() + bioseq.read(inFH) + while bioseq.sequence != None: + seqLen = bioseq.getLength() + outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \ + bioseq.sequence, bioseq.header, seqLen)) + bioseq.read(inFH) + + + ## Splice an input fasta file using coordinates in a Map file + # + # @note the coordinates should be merged beforehand! + # + @staticmethod + def spliceFromCoords( genomeFile, coordFile, obsFile ): + genomeFileHandler = open( genomeFile, "r" ) + obsFileHandler = open( obsFile, "w" ) + dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile ) + + bs = Bioseq() + bs.read( genomeFileHandler ) + while bs.sequence: + if dChr2Maps.has_key( bs.header ): + lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] ) + splicedSeq = "" + currentSite = 0 + for iMap in lCoords: + minSplice = iMap.getMin() - 1 + if minSplice > currentSite: + splicedSeq += bs.sequence[ currentSite : minSplice ] + currentSite = iMap.getMax() + splicedSeq += bs.sequence[ currentSite : ] + bs.sequence = splicedSeq + bs.write( obsFileHandler ) + bs.read( genomeFileHandler ) + + genomeFileHandler.close() + obsFileHandler.close() + + + ## Shuffle input sequences (single file or files in a directory) + # + @staticmethod + def dbShuffle( inData, outData, verbose=0 ): + if CheckerUtils.isExecutableInUserPath("esl-shuffle"): + prg = "esl-shuffle" + else : prg = "shuffle" + genericCmd = prg + " -d INPUT > OUTPUT" + if os.path.isfile( inData ): + if verbose > 0: + print "shuffle input file '%s'" % inData + cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData) + print cmd + returnStatus = os.system( cmd ) + if returnStatus != 0: + sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus ) + sys.exit(1) + + elif os.path.isdir( inData ): + if verbose > 0: + print "shuffle files in input directory '%s'" % inData + if os.path.exists( outData ): + shutil.rmtree( outData ) + os.mkdir( 
outData ) + lInputFiles = glob.glob( "%s/*.fa" %( inData ) ) + nbFastaFiles = 0 + for inputFile in lInputFiles: + nbFastaFiles += 1 + if verbose > 1: + print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) ) + fastaBaseName = os.path.basename( inputFile ) + prefix, extension = os.path.splitext( fastaBaseName ) + cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix)) + returnStatus = os.system( cmd ) + if returnStatus != 0: + sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus ) + sys.exit(1) + + + ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers + # + # @param inClusterFileName string input cluster file name + # @param inFastaFileName string input fasta file name + # @param outFileName string output file name + # @param verbosity integer verbosity + # + @staticmethod + def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0): + dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity) + iFastaParser = FastaParser(inFastaFileName) + with open(outFileName, "w") as f: + for iSequence in iFastaParser.getIterator(): + + header = iSequence.getName() + if dHeader2ClusterClusterMember.get(header): + cluster = dHeader2ClusterClusterMember[header][0] + member = dHeader2ClusterClusterMember[header][1] + else: + clusterIdForSingletonCluster += 1 + cluster = clusterIdForSingletonCluster + member = 1 + + newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header) + iSequence.setName(newHeader) + f.write(iSequence.printFasta()) + + @staticmethod + def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): + dHeader2ClusterClusterMember = {} + clusterId = 0 + with open(inClusterFileName) as f: + line = f.readline() + while line: + lineWithoutLastChar = line.rstrip() + lHeaders = 
lineWithoutLastChar.split("\t") + clusterId += 1 + if verbosity > 0: + print "%i sequences in cluster %i" % (len(lHeaders), clusterId) + memberId = 0 + for header in lHeaders: + memberId += 1 + dHeader2ClusterClusterMember[header] = (clusterId, memberId) + line = f.readline() + if verbosity > 0: + print "%i clusters" % clusterId + return dHeader2ClusterClusterMember, clusterId + + @staticmethod + def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): + """ + Write a map file from fasta output of clustering tool. + Warning: only works if input fasta headers are formated like LTRharvest fasta output. + """ + if not outMapFileName: + outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0]) + + fileDb = open(fastaFileNameFromClustering , "r") + fileMap = open(outMapFileName, "w") + seq = Bioseq() + numseq = 0 + while 1: + seq.read(fileDb) + if seq.sequence == None: + break + numseq = numseq + 1 + ID = seq.header.split(' ')[0].split('_')[0] + chunk = seq.header.split(' ')[0].split('_')[1] + start = seq.header.split(' ')[-1].split(',')[0][1:] + end = seq.header.split(' ')[-1].split(',')[1][:-1] + line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end) + fileMap.write(line + "\n") + + fileDb.close() + fileMap.close() + print "saved in %s" % outMapFileName + + @staticmethod + def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"): + outFormat = outFormat.lower() + if outFormat in ["gff", "gff3"]: + outFormat = "gff3" + else: + outFormat = "map" + + lTNstretches = [] + if nbN != 0: + iBSDB = BioseqDB(fastaFileName) + for iBS in iBSDB.db: + nbNFound = 0 + start = 1 + pos = 1 + lastPos = 0 + + while pos <= iBS.getLength(): + if nbNFound == 0: + start = pos + + while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']: + nbNFound += 1 + pos += 1 + lastPos = pos + + if pos - lastPos >= nbN: + if nbNFound >= nbN: + lTNstretches.append((iBS.getHeader(), start, lastPos - 1)) + 
nbNFound = 0 + pos += 1 + + if nbNFound >= nbN: + lTNstretches.append((iBS.getHeader(), start, lastPos - 1)) + + lTNstretches.sort(key = itemgetter(0, 1, 2)) + + if outFileName == "": + outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat) + + with open(outFileName, "w") as fH: + if outFormat == "gff3": + fH.write("##gff-version 3\n") + for item in lTNstretches: + seq = item[0] + start = item[1] + end = item[2] + if outFormat == "gff3": + fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end)) + else: + fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end)) + + \ No newline at end of file diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/seq/__init__.py diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/utils/FileUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/utils/FileUtils.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,445 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import os
import glob
import shutil
import sys
import re
import math
try:
    import hashlib
except ImportError:
    # Without hashlib, getMd5SecureHash() returns "" (see its sys.modules check).
    pass


class FileUtils( object ):
    """Static helpers to count, compare, concatenate, clean and split text files."""

    ## Return the number of lines in the given file
    #
    # A last line made of a lone "\n" (blank last line) is not counted.
    #
    # @param fileName string name of the file
    # @return integer number of lines
    #
    @staticmethod
    def getNbLinesInSingleFile( fileName ):
        nbLines = 0
        lastLine = ""
        # stream the file instead of loading every line in memory
        with open( fileName, "r" ) as fileHandler:
            for lastLine in fileHandler:
                nbLines += 1
        if nbLines > 0 and lastLine == "\n":
            return nbLines - 1
        return nbLines

    ## Return the number of lines in the files in the given list
    #
    # @param lFileNames list of file names
    # @return integer sum of getNbLinesInSingleFile() over the list
    #
    @staticmethod
    def getNbLinesInFileList( lFileNames ):
        count = 0
        for fileName in lFileNames:
            count += FileUtils.getNbLinesInSingleFile( fileName )
        return count

    ## Return True if the given file exists, False otherwise
    #
    @staticmethod
    def isRessourceExists( fileName ):
        return os.path.exists( fileName )

    ## Return True if the given file is empty, False otherwise
    #
    # @note "empty" means getNbLinesInSingleFile() returns 0, so a file holding a single "\n" is empty too
    #
    @staticmethod
    def isEmpty( fileName ):
        return 0 == FileUtils.getNbLinesInSingleFile( fileName )

    ## Return True if both files are identical, False otherwise
    #
    # The contents are compared byte by byte, without any external command
    # or temporary file. False is returned when one of the paths is not an
    # existing regular file.
    #
    # @param file1 string name of the first file
    # @param file2 string name of the second file
    # @return boolean
    #
    @staticmethod
    def are2FilesIdentical( file1, file2 ):
        if not ( os.path.isfile( file1 ) and os.path.isfile( file2 ) ):
            return False
        # cheap rejection before reading anything
        if os.path.getsize( file1 ) != os.path.getsize( file2 ):
            return False
        chunkSize = 65536
        with open( file1, "rb" ) as fileHandler1:
            with open( file2, "rb" ) as fileHandler2:
                while True:
                    chunk1 = fileHandler1.read( chunkSize )
                    if chunk1 != fileHandler2.read( chunkSize ):
                        return False
                    if not chunk1:
                        # both files exhausted at the same time
                        return True

    ## Return a string with all the content of the files in the given list
    #
    # Files are read in sorted name order; the list given by the caller is left untouched.
    #
    # @param lFiles list of file names
    # @return string concatenated content
    #
    @staticmethod
    def getFileContent( lFiles ):
        lContents = []
        for fileName in sorted( lFiles ):
            with open( fileName, "r" ) as currentFile:
                lContents.append( currentFile.read() )
        return "".join( lContents )

    ## Save content of the given file after having sorted it
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite the input file)
    #
    @staticmethod
    def sortFileContent( inFile, outFile="" ):
        with open( inFile, "r" ) as inFileHandler:
            lines = inFileHandler.readlines()
        lines.sort()
        if outFile == "":
            outFile = inFile
        with open( outFile, "w" ) as outFileHandler:
            outFileHandler.writelines( lines )

    ## Add end-of-line symbol to the given file content if necessary
    #
    # @param fileContent string
    # @return string same content, ending with "\n" unless it is empty
    #
    @staticmethod
    def addNewLineAtTheEndOfFileContent( fileContent ):
        if not fileContent.endswith('\n') and len(fileContent) != 0:
            fileContent += '\n'
        return fileContent

    ## Concatenate files in the given list
    #
    # @param lFiles list of input file names (not modified)
    # @param outFile string name of the output file, opened in append mode
    # @param sort boolean process the input files in sorted name order
    # @param skipHeaders boolean drop the first line of each input file
    # @param separator string written between two consecutive input files
    #
    @staticmethod
    def catFilesFromList( lFiles, outFile, sort=True, skipHeaders = False, separator = "" ):
        if sort:
            lFiles = sorted( lFiles )
        with open( outFile, "a" ) as outFileHandler:
            isFirstFile = True
            for singleFile in lFiles:
                if not isFirstFile:
                    outFileHandler.write( separator )
                isFirstFile = False
                with open( singleFile, "r" ) as singleFileHandler:
                    if skipHeaders:
                        singleFileHandler.readline()
                    shutil.copyfileobj( singleFileHandler, outFileHandler )

    ## Concatenate files according to the given pattern
    #
    # @param pattern string glob pattern selecting the input files
    #
    @staticmethod
    def catFilesByPattern( pattern, outFile, skipHeaders = False, separator = "" ):
        lFiles = glob.glob( pattern )
        FileUtils.catFilesFromList( lFiles, outFile, skipHeaders = skipHeaders, separator = separator )

    ## Remove files listed according to the given pattern
    #
    # @example prefix="/home/tmp/dummy*.txt"
    #
    @staticmethod
    def removeFilesByPattern( prefix ):
        for f in glob.glob( prefix ):
            os.remove( f )

    ## Remove files listed according to the suffixes in the given list
    #
    # @param targetPath string directory holding the files (must not be an empty string)
    # @param lSuffixes list of suffixes; "<targetPath>/*<suffix>" is removed for each of them
    #
    @staticmethod
    def removeFilesBySuffixList( targetPath, lSuffixes ):
        if targetPath[-1] == "/":
            targetPath = targetPath[:-1]
        for suffix in lSuffixes:
            pattern = "%s/*%s" % ( targetPath, suffix )
            FileUtils.removeFilesByPattern( pattern )

    ## Remove repeated blanks in the given file
    #
    # Each run of several spaces is replaced by a single space. The result
    # is written next to the output file and renamed over it at the end.
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite the input file)
    #
    @staticmethod
    def removeRepeatedBlanks( inFile, outFile="" ):
        if outFile == "":
            outFile = inFile
        # temporary name derived from the output path, so that paths with directories work
        tmpFile = "%s.tr_tmp" % ( outFile )
        with open( inFile, "r" ) as inFileHandler:
            with open( tmpFile, "w" ) as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write( re.sub( " +", " ", line ) )
        os.rename( tmpFile, outFile )

    ## Remove files in the given list
    #
    @staticmethod
    def removeFilesFromList(lFiles):
        for f in lFiles:
            os.remove(f)

    ## Remove files in the given list if exist
    #
    @staticmethod
    def removeFilesFromListIfExist(lFiles):
        for fileName in lFiles:
            if FileUtils.isRessourceExists(fileName):
                os.remove(fileName)

    ## Append the content of a file to another file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file
    #
    @staticmethod
    def appendFileContent( inFile, outFile ):
        with open( outFile, "a" ) as outFileHandler:
            with open( inFile, "r" ) as inFileHandler:
                shutil.copyfileobj( inFileHandler, outFileHandler )

    ## Replace Windows end-of-line by Unix end-of-line
    #
    # The file is processed in binary mode so that the "\r\n" pairs are
    # seen whatever the platform. The converted copy is complete before
    # the original is replaced.
    #
    # @param inFile string name of the file, modified in place
    #
    @staticmethod
    def fromWindowsToUnixEof( inFile ):
        tmpFile = "%s.tmp" % ( inFile )
        with open( inFile, "rb" ) as inFileHandler:
            with open( tmpFile, "wb" ) as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write( line.replace( b"\r\n", b"\n" ) )
        os.remove( inFile )
        os.rename( tmpFile, inFile )

    ## Remove duplicated lines in a file
    #
    # @note it preserves the initial order and handles blank lines
    #
    # @param inFile string name of the file, modified in place
    #
    @staticmethod
    def removeDuplicatedLines( inFile ):
        with open( inFile, "r" ) as inFileHandler:
            lLines = inFileHandler.read().split("\n")
        # the split leaves an empty last item when the file ends with "\n"
        if lLines[-1] == "":
            del lLines[-1]
        sSeenLines = set()
        with open( inFile, "w" ) as inFileHandler:
            for line in lLines:
                if line not in sSeenLines:
                    sSeenLines.add( line )
                    inFileHandler.write( "%s\n" % ( line ) )

    ## Write a list of lines in a given file
    #
    # @param inFile string name of the file to write
    # @param lLines list of strings, written as they are (no end-of-line added)
    #
    @staticmethod
    def writeLineListInFile( inFile, lLines ):
        with open( inFile, "w" ) as inFileHandler:
            inFileHandler.writelines( lLines )

    ## Give the list of absolute path of each directory in the given directory
    #
    # @param rootPath string absolute path of the given directory
    #
    # @return lDirPath list of absolute directory path
    #
    @staticmethod
    def getAbsoluteDirectoryPathList(rootPath):
        return [ressource for ressource in glob.glob(rootPath + "/*") if os.path.isdir(ressource)]

    ## Get a sublist of which each element matches/doesn't match a pattern
    #
    # @param lPath string list of paths
    #
    # @param pattern string pattern (regular expression, searched anywhere in the path)
    #
    # @param match bool keep the matching paths if True, the others if False
    #
    # @return lPathMatching list of path matching pattern
    #
    @staticmethod
    def getSubListAccordingToPattern(lPath, pattern, match = True):
        # compile once instead of once per path
        regex = re.compile(".*%s.*" % pattern)
        keepMatching = bool(match)
        return [path for path in lPath if bool(regex.match(path)) == keepMatching]

    ## Give the list of file names found in the given directory
    #
    # @param dirPath string absolute path of the given directory
    # @param patternFileFilter string regular expression matched at the start of each file name
    #
    # @return lFilesInDir list of file names
    #
    @staticmethod
    def getFileNamesList( dirPath, patternFileFilter = ".*" ):
        lFilesInDir = []
        for ressource in glob.glob( dirPath + "/*" ):
            if os.path.isfile( ressource ):
                fileName = os.path.basename( ressource )
                if re.match(patternFileFilter, fileName):
                    lFilesInDir.append( fileName )
        return lFilesInDir

    ## Return the MD5 sum of a file
    #
    # The file is read in binary mode, by chunks.
    #
    # @param inFile string name of the file
    # @return string hexadecimal digest, or "" when hashlib could not be imported
    #
    @staticmethod
    def getMd5SecureHash( inFile ):
        if "hashlib" not in sys.modules:
            return ""
        md5 = hashlib.md5()
        with open( inFile, "rb" ) as inFileHandler:
            while True:
                chunk = inFileHandler.read( 65536 )
                if not chunk:
                    break
                md5.update( chunk )
        return md5.hexdigest()

    ## Cat all files of a given directory
    #
    # @param dir string directory name
    # @param outFileName string output file name
    #
    @staticmethod
    def catFilesOfDir(dir, outFileName):
        lFilePaths = [dir + "/" + fileName for fileName in FileUtils.getFileNamesList(dir)]
        FileUtils.catFilesFromList(lFilePaths, outFileName)

    ## Return True if size file > 0 octet
    #
    # @param fileName string file name
    #
    @staticmethod
    def isSizeNotNull(fileName):
        return os.path.getsize(fileName) > 0

    ## Split one file into N Files by lines
    #
    # Output files are named "<prefix>-<i><ext>" and written in the current directory.
    #
    # @param fileName string file name
    # @param N int number of files to create
    #
    @staticmethod
    def splitFileIntoNFiles(fileName, N):
        nbLine = FileUtils.getNbLinesInSingleFile(fileName)
        nbLinesInEachFile = nbLine
        if N > nbLine:
            N = nbLine
        if N != 0:
            nbLinesInEachFile = int(math.ceil(float(nbLine) / N))
        else:
            # nothing to split: still produce one output file
            N = 1
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName, "r") as fileHandler:
            for i in range(1, N + 1):
                with open("%s-%s%s" % (filePrefix, i, fileExt), "w") as f:
                    for _ in range(nbLinesInEachFile):
                        f.write(fileHandler.readline())

    ## Split one file into files of N lines
    #
    # Output files are named "<prefix>-<i><ext>" and written in the current
    # directory. When the input is empty or N is not positive, the whole
    # input is copied into a single output file.
    #
    # @param fileName string input file name
    # @param N int lines number per files
    #
    @staticmethod
    def splitFileAccordingToLineNumber(fileName, N):
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName) as inF:
            fileNb = 1
            line = inF.readline()
            # N <= 0 (not only N == 0): a negative N would never consume a line and loop forever
            if not line or N <= 0:
                outFileName = "%s-%s%s" % (filePrefix, fileNb, fileExt)
                with open(fileName, "rb") as wholeFile:
                    with open(outFileName, "wb") as f:
                        shutil.copyfileobj(wholeFile, f)
            else:
                while line:
                    outFileName = "%s-%s%s" % (filePrefix, fileNb, fileExt)
                    with open(outFileName, "w") as outF:
                        lineNb = 1
                        while lineNb <= N and line:
                            outF.write(line)
                            line = inF.readline()
                            lineNb += 1
                    fileNb += 1

# ---- patch metadata kept from the changeset export ----
# \ No newline at end of file
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/utils/PipelineStepFTests.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/utils/PipelineStepFTests.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,83 @@
#!/usr/bin/env python

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import sys
import os
import shutil
from commons.core.utils.FileUtils import FileUtils

class PipelineStepFTests(object):
    """
    Scaffold for the functional test of one pipeline step.

    run() calls self.launchStep() then self.assertStep(); neither is defined
    in this class (presumably provided by subclasses -- to be confirmed).
    """

    def __init__(self, pipelineName, packageDir, workingDir, projectName, config = "", clean = True):
        # Only stores the settings; nothing is launched here.
        self._pipelineName = pipelineName
        self._packageDir = packageDir
        self._workingDir = workingDir
        self._projectName = projectName
        self._clean = clean
        self._configFileName = config

    def run(self):
        """Launch the step, then check its results."""
        self.launchStep()
        self.assertStep()

#    def replaceInFile(self, fileName, oldPattern, newPattern, newFileName = ""):
#        if newFileName == "":
#            newFileName = "%s.new" % fileName
#        f = open(newFileName, "w")
#        for line in fileinput.input(fileName, inplace=1):
#            newLine = line.replace(oldPattern, newPattern)
#            f.write(newLine)
#        f.close()
#        fileinput.close()

    def _checkIfFileExist(self, fileName):
        """Return True if fileName exists; otherwise print a message and return False."""
        if FileUtils.isRessourceExists(fileName):
            return True
        print("%s do not exists\n" % fileName)
        return False

    def _printMessageAndClean(self, msg):
        """Print msg with the pipeline name, go up one directory and, if cleaning is on, delete the working directory."""
        print("%s in %s functional test\n" % (msg, self._pipelineName))
        sys.stdout.flush()
        os.chdir("../")
        if not self._clean:
            return
        shutil.rmtree(self._workingDir)

    def _areTwoFilesIdenticalByScript( self, expFileName, obsFileName, scriptName):
        """Run scriptName on the expected and observed files; return True if its exit status is 0."""
        cmd = "%s -v 1 -r %s -t %s 2>/dev/null" % (scriptName, expFileName, obsFileName)
        exitStatus = os.system(cmd)
        # blank line after whatever the script wrote on stdout
        print("")
        sys.stdout.flush()
        return exitStatus == 0

# ---- patch metadata kept from the changeset export ----
# \ No newline at end of file
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/utils/RepetConfigParser.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/utils/RepetConfigParser.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,38 @@
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.
# You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


try:
    from ConfigParser import ConfigParser    # module name under Python 2
except ImportError:
    from configparser import ConfigParser    # module name under Python 3


class RepetConfigParser(ConfigParser):
    """Configuration parser that keeps the option names as they are written."""

    def optionxform(self, optionstr):
        """Return the option name unchanged (the standard parser lower-cases it)."""
        return optionstr

# ---- patch metadata kept from the changeset export ----
# \ No newline at end of file
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/utils/RepetOptionParser.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/utils/RepetOptionParser.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,79 @@
#!/usr/bin/env python

"""
Class overriding optparse.OptionParser default epilog formatter.
The resulting epilog display format is the same as if the corresponding string was printed.
"""

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

from optparse import OptionParser
from optparse import BadOptionError
from optparse import OptionValueError
SUPPRESS_USAGE = "SUPPRESS"+"USAGE"

class RepetOptionParser(OptionParser):
    """
    OptionParser that shows the help when called without argument and that
    prints its description and epilog as they are, without re-wrapping.
    """

    def parse_args(self, args=None, values=None):
        """
        Parse the arguments like OptionParser.parse_args, except that an
        empty argument list is replaced by ["-h"].
        Return what self.check_values(values, args) returns.
        """
        rargs = self._get_args(args)
        if not rargs:
            # no argument at all: behave as if the help was requested
            rargs = ["-h"]
        if values is None:
            values = self.get_default_values()
        self.rargs = rargs
        self.largs = largs = []
        self.values = values
        try:
            self._process_args(largs, rargs, values)
        except (BadOptionError, OptionValueError) as err:
            self.error(str(err))
        args = largs + rargs
        return self.check_values(values, args)

    def set_usage(self, usage):
        """
        Store the usage string, without its leading "usage: " if any.
        An empty usage or SUPPRESS_USAGE sets self.usage to None.
        """
        # compared by value: identity of two equal strings is not guaranteed
        if not usage or usage == SUPPRESS_USAGE:
            self.usage = None
        elif usage.lower().startswith("usage: "):
            self.usage = usage[7:]
        else:
            self.usage = usage

    def format_epilog(self, formatter):
        """Return the epilog unchanged ("" if there is none); formatter is not used."""
        if self.epilog is None:
            return ""
        return self.epilog

    def format_description(self, formatter):
        """Return the description unchanged ("" if there is none); formatter is not used."""
        if self.description is None:
            return ""
        return self.description

# ---- patch metadata kept from the changeset export ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/utils/__init__.py
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/BedWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/BedWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,100 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class BedWriter(TranscriptListWriter):
    """
    Writer of transcript lists in BED format
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle: handle to the file
    @type handle: file handle
    @ivar header: first lines of the file (the "track" line)
    @type header: string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Set the default track line, then initialise the parent writer
        @param fileName: name of the file
        @type fileName: string
        @param verbosity: verbosity
        @type verbosity: int
        """
        # the header is set before the parent constructor runs, as in the initial code
        defaultHeader = "track name=reads description=\"Reads\" useScore=0 visibility=full offset=0\n"
        self.header = defaultHeader
        super(BedWriter, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Give the format names handled by this writer
        @return: a list of strings
        """
        formats = ["bed"]
        return formats


    @staticmethod
    def getExtension():
        """
        Give the usual extension of the written files
        @return: a string
        """
        extension = "bed"
        return extension


    def setTitle(self, title):
        """
        Use the title as name and description of the track; a None title leaves the header unchanged
        @param title: the title of the track
        @type title: string
        """
        if title != None:
            headerFormat = "track name=%s description=\"%s\" useScore=0 visibility=full offset=0\n"
            self.header = headerFormat % (title, title)


    def copyProperties(self, bedParser):
        """
        Take the title of a parser, to produce a similar output
        @param bedParser: a BED Parser parser
        @type bedParser: class L{BedParser}
        """
        parserTitle = bedParser.title
        self.setTitle(parserTitle)


    def printTranscript(self, transcript):
        """
        Give the text of the given transcript, as returned by its printBed() method
        @param transcript: transcript to be printed
        @type transcript: class L{Transcript}
        @return: a string
        """
        bedText = transcript.printBed()
        return bedText

# ---- patch metadata kept from the changeset export ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/CsvWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/CsvWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,153 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge.
# Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os
import random
from commons.core.writer.TranscriptListWriter import TranscriptListWriter
from SMART.Java.Python.misc.Progress import Progress

class CsvWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with CSV (Excel) format
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle: handle to the file
    @type handle: file handle
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName: name of the file
        @type fileName: string
        @param verbosity: verbosity
        @type verbosity: int
        """
        super(CsvWriter, self).__init__(fileName, verbosity)
        self.header = ""
        self.title = "chromosome,start,end,strand,exons,tags\n"
        # set to True once modifyCsv() has rewritten the file
        self.modified = False


    def __del__(self):
        """
        Destructor
        (Trick to write 1 tag per column)
        """
        if self.handle != None:
            self.modifyCsv()
        super(CsvWriter, self).__del__()


    def close(self):
        """
        Rewrite the file with one column per tag (if a handle is set), then close through the parent class
        """
        if self.handle != None:
            self.modifyCsv()
        super(CsvWriter, self).close()


    def modifyCsv(self):
        """
        Clean CSV file so that there is one column per tag
        The file is read twice: first to collect the tag names, then to write
        a temporary copy with one column per tag, which replaces the file.
        Does nothing if the file was already rewritten.
        """
        if self.modified:
            return

        # read all the tags
        self.handle.close()
        self.handle = open(self.fileName)
        nbFirstFields = 5
        tags = set()
        if self.verbosity >= 10:
            print("Modifying CSV file...")
        number = -1
        for number, line in enumerate(self.handle):
            # the first line is not scanned for tags
            if number != 0:
                for tag in line.strip().split(",")[nbFirstFields:]:
                    if tag.find("=") != -1:
                        tags.add(tag.split("=", 1)[0])
        if self.verbosity >= 10:
            print(" ...done")

        # re-write the file
        tmpFileName = "tmpFile%d.csv" % (random.randint(0, 100000))
        self.handle.seek(0)
        progress = Progress(number + 1, "Re-writting CSV file", self.verbosity)
        with open(tmpFileName, "w") as tmpFile:
            tmpFile.write(self.title.replace("tags", ",".join(sorted(tags))))
            for line in self.handle:
                fields = line.strip().split(",")
                tagValues = dict([(key, None) for key in tags])
                tmpFile.write(",".join(fields[:nbFirstFields]))
                # key of the last "key=value" field seen on THIS row
                currentKey = None
                for tag in fields[nbFirstFields:]:
                    if tag.find("=") != -1:
                        (currentKey, value) = tag.split("=", 1)
                        tagValues[currentKey] = value
                    elif currentKey is not None:
                        # a value that contained commas was split: glue the piece back to its tag
                        tagValues[currentKey] += ";%s" % (tag)
                    # else: piece with no tag before it on this row, there is nothing to attach it to
                for key in sorted(tagValues.keys()):
                    # NOTE(review): a tag absent from the row is written as "None" -- confirm this is wanted
                    tmpFile.write(",%s" % (tagValues[key]))
                tmpFile.write("\n")
                progress.inc()

        # replace former file
        import shutil
        shutil.move(tmpFileName, self.fileName)
        progress.done()
        self.modified = True


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["csv", "xls", "excel"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "csv"


    def printTranscript(self, transcript):
        """
        Export the given transcript with CSV format (text returned by its printCsv() method)
        @param transcript: transcript to be printed
        @type transcript: class L{Transcript}
        @return: a string
        """
        return transcript.printCsv()

# ---- patch metadata kept from the changeset export ----
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/EmblWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/EmblWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,116 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of
distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class EmblWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into several files with EMBL format
    (one file per chromosome)
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file (unused here: always None)
    @type handle:   file handle
    @ivar handles:  the output files, one per chromosome
    @type handles:  dict of file handles
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        The constructor of the parent class is deliberately not called: no
        single output file is opened, files are created per chromosome.
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.fileName  = fileName
        self.verbosity = verbosity
        self.handles   = {}
        self.handle    = None


    def __del__(self):
        """
        Destructor
        Trick to append the sequences at the end of the EMBL files
        """
        # getattr: this class never sets sequenceFileName itself, and the
        # constructor may have failed before "handles" was created
        handles          = getattr(self, "handles", {})
        sequenceFileName = getattr(self, "sequenceFileName", None)
        try:
            if sequenceFileName is not None and handles:
                self._appendSequences(sequenceFileName, handles)
        finally:
            # always release the output files, even if the sequence file is unreadable
            for handle in handles.values():
                handle.close()


    @staticmethod
    def _appendSequences(sequenceFileName, handles):
        """
        Copy each FASTA record into the EMBL file of the matching chromosome
        @param sequenceFileName: name of a FASTA file
        @type  sequenceFileName: string
        @param handles:          the output files, indexed by chromosome name
        @type  handles:          dict of file handles
        """
        currentHandle = None
        with open(sequenceFileName) as sequenceFile:
            for line in sequenceFile:
                if line.startswith(">"):
                    # records of chromosomes without output file are skipped
                    currentHandle = handles.get(line[1:].strip())
                elif currentHandle is not None:
                    currentHandle.write(line)


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["embl"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "embl"


    def addTranscript(self, transcript):
        """
        Add a transcript to the list of transcripts to be written
        @param transcript: transcript to be written
        @type  transcript: class L{Transcript}
        """
        chromosome = transcript.getChromosome()
        if chromosome not in self.handles:
            # strip the extension only if it is really there
            prefix = self.fileName
            if prefix.endswith(".embl"):
                prefix = prefix[:-len(".embl")]
            self.handles[chromosome] = open("%s%s.embl" % (prefix, chromosome.title()), "w")
        self.handles[chromosome].write(self.printTranscript(transcript))


    def printTranscript(self, transcript):
        """
        Export the given transcript with EMBL format
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript}
        @return: a string
        """
        return transcript.printEmbl()
from commons.core.writer.SequenceListWriter import SequenceListWriter


class FastaWriter(SequenceListWriter):
    """
    Writer which dumps a list of sequences into a FASTA file
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar header:   first lines of the file
    @type header:   string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor: everything is delegated to the parent class
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(FastaWriter, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Names under which this format is known
        @return: a list of strings
        """
        formats = ["fasta", "mfa"]
        return formats


    @staticmethod
    def getExtension():
        """
        Usual extension of the files written
        @return: a string
        """
        extension = "fasta"
        return extension


    def getLine(self, sequence):
        """
        Format one sequence as a FASTA record
        @param sequence: sequence to be written
        @type  sequence: class L{Sequence}
        @return: whatever the sequence produces for FASTA
        """
        record = sequence.printFasta()
        return record
from commons.core.writer.SequenceListWriter import SequenceListWriter


class FastqWriter(SequenceListWriter):
    """
    Writer which dumps a list of sequences into a FASTQ file
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar header:   first lines of the file
    @type header:   string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor: everything is delegated to the parent class
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(FastqWriter, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Names under which this format is known
        @return: a list of strings
        """
        formats = ["fastq", "mfq"]
        return formats


    @staticmethod
    def getExtension():
        """
        Usual extension of the files written
        @return: a string
        """
        extension = "fastq"
        return extension


    def getLine(self, sequence):
        """
        Format one sequence as a FASTQ record
        @param sequence: sequence to be written
        @type  sequence: class L{Sequence}
        @return: whatever the sequence produces for FASTQ
        """
        record = sequence.printFastq()
        return record
b/commons/core/writer/GbWriter.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,102 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class GbWriter(TranscriptListWriter):
    """
    Writer which dumps a list of transcripts into a GBrowse file
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar header:   first lines of the file (the track description)
    @type header:   string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor: set the default (red) track header, then let the parent
        class initialise the rest
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.header = "[READS]\nbgcolor = red\nstrand_arrow = 1\n\n"
        super(GbWriter, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Names under which this format is known
        @return: a list of strings
        """
        formats = ["gb", "gbrowse"]
        return formats


    @staticmethod
    def getExtension():
        """
        Usual extension of the files written
        @return: a string
        """
        extension = "gb"
        return extension


    def setColor(self, color):
        """
        Change the background color of the track; None leaves the header untouched
        @param color: the color of the track
        @type  color: string
        """
        if color is None:
            return
        self.header = "[READS]\nbgcolor= %s\nstrand_arrow = 1\n\n" % (color)


    def copyProperties(self, gbParser):
        """
        Re-use the track color read by a parser, to produce a similar output
        @param gbParser: a GBrowse parser
        @type  gbParser: class L{GbParser}
        """
        parsedColor = gbParser.color
        self.setColor(parsedColor)


    def printTranscript(self, transcript):
        """
        Export the given transcript with GBrowse format
        The reference is skipped when the previous transcript was located on
        the same chromosome
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript}
        @return: a string
        """
        previous = self.lastChromosome
        sameReference = previous is not None and previous == transcript.getChromosome()
        if not sameReference:
            return transcript.printGBrowse()
        return transcript.printGBrowseLine()
+# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class Gff2Writer(TranscriptListWriter):
    """
    Writer which dumps a list of transcripts into a GFF2 file
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar title:    title given to the transcripts when they are printed
    @type title:    string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor: start with an empty title and header, then let the
        parent class initialise the rest
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.title  = ""
        self.header = ""
        super(Gff2Writer, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Names under which this format is known
        @return: a list of strings
        """
        formats = ["gff2"]
        return formats


    @staticmethod
    def getExtension():
        """
        Usual extension of the files written
        @return: a string
        """
        extension = "gff2"
        return extension


    def setTitle(self, title):
        """
        Change the title used when printing the transcripts
        @param title: the title of the transcripts
        @type  title: string
        """
        self.title = title


    def printTranscript(self, transcript):
        """
        Export the given transcript with GFF2 format, using the current title
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript}
        @return: a string
        """
        currentTitle = self.title
        return transcript.printGff2(currentTitle)
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class Gff3Writer(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with GFF3 format
    @ivar fileName:    name of the file
    @type fileName:    string
    @ivar handle:      handle to the file
    @type handle:      file handle
    @ivar title:       content of the "source" column
    @type title:       string
    @ivar feature:     default content of the "type" column for transcripts
    @type feature:     string
    @ivar featurePart: content of the "type" column for exons
    @type featurePart: string
    """


    def __init__(self, fileName, verbosity = 0, title="S-MART", feature="transcript", featurePart="exon"):
        """
        Constructor
        @param fileName:    name of the file
        @type  fileName:    string
        @param verbosity:   verbosity
        @type  verbosity:   int
        @param title:       the title of the transcripts
        @type  title:       string
        @param feature:     the name of the feature
        @type  feature:     string
        @param featurePart: the name of the feature part
        @type  featurePart: string
        """
        self.header      = ""
        self.title       = title
        self.feature     = feature
        self.featurePart = featurePart
        super(Gff3Writer, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["gff3", "gff"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "gff3"


    def setTitle(self, title):
        """
        Set the title of the transcripts
        @param title: the title of the transcripts
        @type  title: string
        """
        self.title = title


    def setFeature(self, feature):
        """
        Set the name of the feature
        @param feature: the name of the feature
        @type  feature: string
        """
        self.feature = feature


    def setFeaturePart(self, featurePart):
        """
        Set the name of the feature part
        @param featurePart: the name of the feature part
        @type  featurePart: string
        """
        self.featurePart = featurePart


    def printTranscript(self, transcript):
        """
        Export the given transcript with GFF3 format
        The "feature" and "score" tags go to their own GFF3 columns, so they
        are hidden from the attribute column while printing, and put back
        afterwards.  An "ID" tag is added to the transcript if missing.
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript}
        @return: a string
        """
        direction = "-" if transcript.getDirection() == -1 else "+"
        transcript.sortExonsIncreasing()
        # test the tag names, not the formatted tag values
        if "ID" not in transcript.getTagNames():
            transcript.setTagValue("ID", transcript.getUniqueName())

        hiddenTags = {}
        try:
            feature = self.feature
            if "feature" in transcript.getTagNames():
                feature = transcript.getTagValue("feature")
                hiddenTags["feature"] = transcript.tags.pop("feature")
            score = "."
            if "score" in transcript.getTagNames():
                score = "%d" % (int(transcript.getTagValue("score")))
                hiddenTags["score"] = transcript.tags.pop("score")
            comment = transcript.getTagValues(";", "=")
            lines   = ["%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (transcript.getChromosome(), self.title, feature, transcript.getStart(), transcript.getEnd(), score, direction, comment)]
            if len(transcript.exons) > 1:
                transcriptId = transcript.getTagValue("ID")
                for i, exon in enumerate(transcript.getExons()):
                    # an exon without score falls back on the transcript score,
                    # not on the score of the previous exon
                    exonScore = score
                    if "score" in exon.getTagNames():
                        exonScore = "%d" % (int(exon.getTagValue("score")))
                    lines.append("%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tID=%s-%s%d;Name=%s-%s%d;Parent=%s\n" % (transcript.getChromosome(), self.title, self.featurePart, exon.getStart(), exon.getEnd(), exonScore, direction, transcriptId, self.featurePart, i+1, transcript.name, self.featurePart, i+1, transcriptId))
        finally:
            # give the hidden tags back to the transcript (the former code
            # stored an alias of the tags on the writer instead)
            transcript.tags.update(hiddenTags)
        return "".join(lines)
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class GtfWriter(TranscriptListWriter):
    """
    Writer which dumps a list of transcripts into a GTF file
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar title:    title given to the transcripts when they are printed
    @type title:    string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor: start with the "S-MART" title and an empty header, then
        let the parent class initialise the rest
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.title  = "S-MART"
        self.header = ""
        super(GtfWriter, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Names under which this format is known
        @return: a list of strings
        """
        formats = ["gtf", "gtf2"]
        return formats


    @staticmethod
    def getExtension():
        """
        Usual extension of the files written
        @return: a string
        """
        extension = "gtf"
        return extension


    def setTitle(self, title):
        """
        Change the title used when printing the transcripts
        @param title: the title of the transcripts
        @type  title: string
        """
        self.title = title


    def printTranscript(self, transcript):
        """
        Export the given transcript with GTF format, using the current title
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript}
        @return: a string
        """
        currentTitle = self.title
        return transcript.printGtf(currentTitle)
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class MapWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with map format
    (one line per transcript: name, chromosome, start, end + 1)
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar title:    title of the transcripts (not used in the output lines)
    @type title:    string
    """


    def __init__(self, fileName, verbosity = 0, title="S-MART"):
        """
        Constructor
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        @param title:     the title of the transcripts
        @type  title:     string
        """
        self.header = ""
        self.title  = title
        TranscriptListWriter.__init__(self, fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["map"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "map"


    def setTitle(self, title):
        """
        Set the title of the transcripts
        @param title: the title of the transcripts
        @type  title: string
        """
        self.title = title


    def printTranscript(self, transcript):
        """
        Export the given transcript to map format
        When the transcript has several occurrences, its occurrence number is
        appended to its name.
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript}
        @return: a string
        """
        name     = transcript.name
        tagNames = transcript.getTagNames()
        # the tag which is tested is now the one which is printed
        # ("occurrence"; the former test read "occurrences")
        # NOTE(review): confirm the tag name against the code which sets it
        if "nbOccurrences" in tagNames and transcript.getTagValue("nbOccurrences") != 1 and "occurrence" in tagNames:
            occurrence = transcript.getTagValue("occurrence")
            if occurrence is not None:
                name = "%s-%d" % (name, occurrence)
        # kept for its side effect on the transcript (exons get sorted)
        transcript.sortExonsIncreasing()
        return "%s\t%s\t%d\t%d\n" % (name, transcript.getChromosome(), transcript.getStart(), transcript.getEnd()+1)
+# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os
import random
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable
from SMART.Java.Python.misc.Progress import Progress


class MySqlTranscriptWriter(object):
    """
    A class that writes a transcript list into a mySQL table
    @ivar name:                      name of the tables
    @type name:                      string
    @ivar tables:                    the tables (one per chromosome)
    @type tables:                    dict of L{MySqlTranscriptTable}
    @ivar indices:                   indices to create on every new table (index name -> indexed values)
    @type indices:                   dict of string to list of strings
    @ivar mySqlConnection:           connection to a MySQL database
    @type mySqlConnection:           class L{MySqlConnection}
    @ivar tmpTranscriptFileHandles:  files where transcripts are temporary stored (not used by the methods of this class)
    @type tmpTranscriptFileHandles:  dict of file handles
    @ivar nbTranscriptsByChromosome: number of transcripts written
    @type nbTranscriptsByChromosome: dict of int (one for each chromosome)
    @ivar randomNumber:              a random number, used for having a unique name for the indices
    @type randomNumber:              int
    @ivar toBeWritten:               there exists transcripts to be copied into database
    @type toBeWritten:               bool
    @ivar transcriptValues:          SQL values of the transcripts which are buffered, by chromosome
    @type transcriptValues:          dict of string to list
    @ivar nbTranscriptValues:        number of buffered transcripts above which the buffer is flushed
    @type nbTranscriptValues:        int
    @ivar verbosity:                 verbosity
    @type verbosity:                 int
    """

    def __init__(self, connection, name=None, verbosity=0):
        """
        Constructor
        @param connection: connection to the database
        @type  connection: class L{MySqlConnection}
        @param name:       name of the file (only the part after the last path separator is kept)
        @type  name:       string
        @param verbosity:  verbosity
        @type  verbosity:  int
        """
        self.name = name
        self.verbosity = verbosity
        self.tables = {}
        self.indices = {}
        self.tmpTranscriptFileHandles = {}
        self.nbTranscriptsByChromosome = {}
        self.toBeWritten = False
        self.randomNumber = random.randint(0, 100000)
        self.mySqlConnection = connection
        self.nbTmpFiles = 100
        self.transcriptValues = {}
        self.nbTranscriptValues = 1000
        if self.name is not None:
            # Table names must not contain the directory part of a path.
            pos = self.name.rfind(os.sep)
            if pos != -1:
                self.name = self.name[pos+1:]

    def __del__(self):
        """
        Destructor
        Possibly write into database the last transcripts
        """
        # getattr: the finalizer may run on an object whose constructor did not complete.
        if getattr(self, "toBeWritten", False):
            self.write()

    def addIndex(self, name, values):
        """
        Add an index to the tables (only tables created afterwards get it)
        @param name:   name of the index
        @type  name:   string
        @param values: values to index
        @type  values: list of strings
        """
        self.indices[name] = values

    def createTable(self, chromosome):
        """
        Create a table for a chromosome, with the indices registered so far
        @param chromosome: a chromosome name
        @type  chromosome: string
        """
        table = MySqlTranscriptTable(self.mySqlConnection, self.name, chromosome, self.verbosity)
        self.tables[chromosome] = table
        table.createTranscriptTable()
        for name, values in self.indices.items():
            table.createIndex("%s_%s_%d" % (name, chromosome, self.randomNumber), values)

    def addTranscript(self, transcript):
        """
        Add a transcript to the list of transcripts to be written
        The transcript is buffered; the buffer is flushed when it holds more than
        C{nbTranscriptValues} transcripts.
        @param transcript: transcript to be written
        @type  transcript: class L{Transcript}
        """
        chromosome = transcript.getChromosome()
        if chromosome not in self.tables:
            self.createTable(chromosome)
        self.transcriptValues.setdefault(chromosome, []).append(transcript.getSqlValues())
        # Count from zero, like getIterator does: seeding the counter with 1 and then
        # incrementing counted the first transcript twice, and a table created through
        # createTable() or getIterator() had no seeded counter at all.
        self.nbTranscriptsByChromosome[chromosome] = self.nbTranscriptsByChromosome.get(chromosome, 0) + 1
        self.toBeWritten = True
        nbBuffered = sum(len(values) for values in self.transcriptValues.values())
        if nbBuffered > self.nbTranscriptValues:
            self.write()

    def addElement(self, element):
        """
        Same as "addTranscript"
        @param element: transcript to be written
        @type  element: class L{Transcript}
        """
        self.addTranscript(element)

    def addTranscriptList(self, transcriptListParser):
        """
        Add a list of transcripts to the transcripts to be written
        The connection pulls the insertion queries from L{getIterator}.
        @param transcriptListParser: transcripts to be written
        @type  transcriptListParser: class L{TranscriptListParser}
        """
        self.transcriptListParser = transcriptListParser
        self.mySqlConnection.executeManyFormattedQueriesIterator(self)

    def getIterator(self):
        """
        Iterator to the SQL commands to insert the list
        @return: pairs (query with "?" placeholders, list of the values to bind)
        """
        parser = self.transcriptListParser
        progress = Progress(parser.getNbTranscripts(), "Storing %s into database" % (parser.fileName), self.verbosity)
        for transcript in parser.getIterator():
            chromosome = transcript.getChromosome()
            if chromosome not in self.tables:
                self.createTable(chromosome)
            self.nbTranscriptsByChromosome[chromosome] = self.nbTranscriptsByChromosome.get(chromosome, 0) + 1
            values = transcript.getSqlValues()
            table = self.tables[chromosome]
            query = "INSERT INTO '%s' (%s) VALUES (%s)" % (table.name, ", ".join(table.variables), ", ".join(["?"] * len(table.variables)))
            yield (query, [values[variable] for variable in table.variables])
            progress.inc()
        progress.done()

    def write(self):
        """
        Copy the buffered transcripts into the database
        (May add transcripts to already created databases)
        """
        for chromosome, values in self.transcriptValues.items():
            self.tables[chromosome].insertManyFormatted(values)
        self.transcriptValues = {}
        self.toBeWritten = False

    def getTables(self):
        """
        Get the tables (pending transcripts are written first)
        @return: the mySQL tables
        """
        if self.toBeWritten:
            self.write()
        return self.tables

    def removeTables(self):
        """
        Drop the tables
        """
        for chromosome in self.tables:
            self.tables[chromosome].remove()


# --- Remainder of the original lines, kept verbatim: patch metadata and licence header of the next file ---
# \ No newline at end of file
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/SamWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/SamWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,101 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os
import random
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class SamWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with SAM format
    @ivar sizes:         estimated sizes of the chromosomes (largest transcript end seen so far)
    @type sizes:         dict of string to int
    @ivar headerWritten: whether the "@SQ" header has already been added to the file
    @type headerWritten: boolean
    """

    def __init__(self, fileName, verbosity=0):
        """
        Constructor
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(SamWriter, self).__init__(fileName, verbosity)
        self.sizes = {}
        self.headerWritten = False

    def close(self):
        """
        Close file (trick to add header)
        The chromosome sizes are only known once every transcript has been seen, so
        the "@SQ" lines are written to a temporary file, the alignments are appended
        to it, and the temporary file then replaces the output file.
        """
        super(SamWriter, self).close()
        # Default to True: close() is also reached from the destructor, possibly on
        # an object whose constructor failed before 'headerWritten' was set.
        if getattr(self, "headerWritten", True):
            return
        # The temporary file is created next to the output file (not in the current
        # working directory), so that it is written where the output is writable and
        # os.rename never has to cross a file system boundary.
        tmpFileName = "%s.tmp%d" % (self.fileName, random.randint(0, 100000))
        try:
            with open(tmpFileName, "w") as tmpHandle:
                for chromosome, size in self.sizes.items():
                    tmpHandle.write("@SQ\tSN:%s\tLN:%d\n" % (chromosome, size))
                with open(self.fileName) as bodyHandle:
                    for line in bodyHandle:
                        tmpHandle.write(line)
            os.rename(tmpFileName, self.fileName)
        except Exception:
            # Do not leave a half-written temporary file behind.
            if os.path.exists(tmpFileName):
                os.remove(tmpFileName)
            raise
        self.headerWritten = True

    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["sam"]

    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "sam"

    def printTranscript(self, transcript):
        """
        Export the given transcript with SAM format, and update the estimated size of its chromosome
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript}
        @return:           a string
        """
        chromosome = transcript.getChromosome()
        self.sizes[chromosome] = max(transcript.getEnd(), self.sizes.get(chromosome, 0))
        return transcript.printSam()


# --- Remainder of the original line, kept verbatim: patch metadata and licence header of the next file ---
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/SequenceListWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/SequenceListWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,94 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software
is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

class SequenceListWriter(object):
    """
    An interface that writes a list of sequences into a file
    Sub-classes provide C{getLine(sequence)}, which formats one sequence.
    @ivar fileName:  name of the file
    @type fileName:  string
    @ivar handle:    handle to the file
    @type handle:    file handle
    @ivar verbosity: verbosity
    @type verbosity: int
    """

    def __init__(self, fileName, verbosity=0):
        """
        Constructor
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.fileName = fileName
        self.verbosity = verbosity
        self.handle = open(self.fileName, "w")

    def __del__(self):
        """
        Destructor
        """
        self.close()

    def write(self):
        """
        No-op
        """
        pass

    def close(self):
        """
        Close writer (may be called several times)
        """
        # getattr: close() is reached from the destructor even when open() failed in
        # the constructor, in which case the 'handle' attribute was never created.
        handle = getattr(self, "handle", None)
        if handle is not None:
            handle.close()

    def addSequence(self, sequence):
        """
        Add a sequence to the list of sequence to be written
        @param sequence: sequence to be written
        @type  sequence: class L{Sequence}
        """
        self.handle.write(self.getLine(sequence))

    def addElement(self, element):
        """
        Same as "addSequence"
        @param element: sequence to be written
        @type  element: class L{Sequence}
        """
        self.addSequence(element)


# --- Remainder of the original line, kept verbatim: patch metadata and licence header of the next file ---
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/TranscriptListWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/TranscriptListWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,163 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from SMART.Java.Python.misc.Progress import Progress


class TranscriptListWriter(object):
    """
    An interface that writes a transcript list into a file
    Sub-classes provide C{printTranscript(transcript)}, which formats one transcript.
    @ivar fileName:         name of the file
    @type fileName:         string
    @ivar handle:           handle to the file
    @type handle:           file handle
    @ivar header:           first lines of the file
    @type header:           string
    @ivar started:          whether some transcripts have already been written
    @type started:          boolean
    @ivar lastChromosome:   the chromosome of the transcript which was inserted last
    @type lastChromosome:   string
    @ivar sequenceFileName: name of the multi-fasta file given to L{addSequenceFile}
    @type sequenceFileName: string
    @ivar verbosity:        verbosity
    @type verbosity:        int
    """

    def __init__(self, fileName, verbosity=0):
        """
        Constructor
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.fileName = fileName
        self.verbosity = verbosity
        self.handle = open(self.fileName, "w")
        self.started = False
        self.lastChromosome = None
        self.header = ""
        self.sequenceFileName = None

    def __del__(self):
        """
        Destructor
        """
        self.close()

    def close(self):
        """
        Close writer (may be called several times)
        """
        # getattr: close() is reached from the destructor even when open() failed in
        # the constructor, in which case the 'handle' attribute was never created.
        handle = getattr(self, "handle", None)
        if handle is not None and not handle.closed:
            handle.close()
        self.handle = None

    def addTranscript(self, transcript):
        """
        Add a transcript to the list of transcripts to be written
        The header is written just before the first transcript.
        @param transcript: transcript to be written
        @type  transcript: class L{Transcript}
        """
        if not self.started:
            self.handle.write(self.header)
            self.started = True
        self.handle.write(self.printTranscript(transcript))
        self.lastChromosome = transcript.getChromosome()

    def addElement(self, element):
        """
        Same as "addTranscript"
        @param element: transcript to be written
        @type  element: class L{Transcript}
        """
        self.addTranscript(element)

    def addTranscriptList(self, transcriptList):
        """
        Add a list of transcripts to the transcripts to be written
        @param transcriptList: transcripts to be written
        @type  transcriptList: class L{TranscriptList}
        """
        progress = Progress(transcriptList.getNbTranscripts(), "Writing transcripts", self.verbosity)
        for transcript in transcriptList.getIterator():
            self.addTranscript(transcript)
            progress.inc()
        progress.done()

    def addTranscriptTable(self, transcriptTable):
        """
        Add a list of transcripts in a mySQL table to the transcripts to be written
        @param transcriptTable: transcripts to be written
        @type  transcriptTable: class L{MySqlTranscriptTable}
        """
        for transcript in transcriptTable.getIterator():
            self.addTranscript(transcript)

    def setTitle(self, title):
        """
        Possibly write a title for the list (by default, do nothing)
        @param title: a title for the list
        @type  title: string
        """
        pass

    def setFeature(self, feature):
        """
        Set the name of the feature (by default, do nothing)
        @param feature: the name of the feature
        @type  feature: string
        """
        pass

    def setFeaturePart(self, featurePart):
        """
        Set the name of the feature part (by default, do nothing)
        @param featurePart: the name of the feature part
        @type  featurePart: string
        """
        pass

    def setStrands(self, strands):
        """
        Possibly consider both strands separately (by default, do nothing)
        Default for the writers that do not distinguish strands.
        @param strands: whether both strands should be considered separately
        @type  strands: boolean
        """
        pass

    def addSequenceFile(self, fileName):
        """
        Set the multi-fasta file of the sequences
        @param fileName: name of the multi-fasta file
        @type  fileName: string
        """
        self.sequenceFileName = fileName

    def write(self):
        """
        No-op
        """
        pass


# --- Remainder of the original lines, kept verbatim: patch metadata and licence header of the next file ---
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/TranscriptWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/TranscriptWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,189 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
+# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os
import sys
from commons.core.writer.WriterChooser import WriterChooser
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter


class TranscriptWriter(object):
    """
    An interface class that writes a list of transcripts, handle different formats
    @ivar container: container of the data
    @type container: L{TranscriptContainer}
    @ivar format:    format of the data to be printed
    @type format:    string
    @ivar file:      the file where to print
    @type file:      string
    @ivar type:      type of the data ("transcript", "sequence" or "sql")
    @type type:      string
    @ivar writer:    a transcript list writer
    @type writer:    L{TranscriptListWriter} or None
    @ivar mode:      use a container ("container") or enter transcript one by one ("transcript")
    @type mode:      string
    @ivar verbosity: verbosity
    @type verbosity: int
    """

    def __init__(self, file, format, verbosity=0):
        """
        Constructor
        @param file:      file where to print
        @type  file:      string
        @param format:    format of the data
        @type  format:    string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.container = None
        self.format = format
        self.file = file

        self.verbosity = verbosity
        self.type = None
        self.writer = None
        self.mode = None
        if self.format is None:
            sys.exit("Error! Writer input format is empty!")

        if self.format == "sql":
            self.type = "sql"
            pos = self.file.rfind(os.sep)
            if pos > -1:
                self.file = self.file[pos+1:]
            # MySqlTranscriptWriter expects (connection, name, verbosity). Passing
            # (file, verbosity) bound the verbosity to 'name' and made the constructor
            # fail on 'name.rfind'.
            # NOTE(review): no database connection is available here; transcript-by-
            # transcript output to SQL still needs a real connection -- confirm how it
            # should be obtained.
            self.writer = MySqlTranscriptWriter(None, self.file, self.verbosity)
        else:
            writerChooser = WriterChooser(self.verbosity)
            writerChooser.findFormat(self.format)
            self.writer = writerChooser.getWriter(self.file)
            self.type = writerChooser.getType()

    def _callWriter(self, methodName, *args):
        """
        Call an optional method of the underlying writer, if the writer provides it
        @param methodName: name of the method
        @type  methodName: string
        @return:           what the method returns, or None if there is no such method
        """
        method = getattr(self.writer, methodName, None)
        if method is not None:
            return method(*args)
        return None

    def close(self):
        """
        Close writer
        """
        self._callWriter("close")

    def setContainer(self, container):
        """
        Set a container for the data
        @param container: container of the data
        @type  container: class L{TranscriptContainer}
        """
        self.container = container
        if self.mode == "transcript":
            raise Exception("Error! TranscriptWriter '%s' on 'transcript' mode is currently used on 'container' mode." % (self.file))
        self.mode = "container"

    def addTranscript(self, transcript):
        """
        Add a transcript to write
        @param transcript: a transcript
        @type  transcript: class L{Transcript}
        """
        # Check the mode first, so that nothing is written when the writer is misused.
        if self.mode == "container":
            sys.exit("Error! TranscriptWriter '%s' on 'container' mode is currently used on 'transcript' mode." % (self.file))
        self.writer.addTranscript(transcript)
        self.mode = "transcript"

    def addElement(self, transcript):
        """
        Same as addTranscript
        """
        self.addTranscript(transcript)

    def setTitle(self, title):
        """
        Possibly write a title for the list
        @param title: a title for the list
        @type  title: string
        """
        self._callWriter("setTitle", title)

    def setFeature(self, feature):
        """
        Possibly set the name of the feature
        @param feature: the name of the feature
        @type  feature: string
        """
        self._callWriter("setFeature", feature)

    def setFeaturePart(self, featurePart):
        """
        Possibly set the name of the feature part
        @param featurePart: the name of the feature part
        @type  featurePart: string
        """
        self._callWriter("setFeaturePart", featurePart)

    def setStrands(self, strands):
        """
        Possibly consider both strands separately
        @param strands: whether both strands should be considered separately
        @type  strands: boolean
        """
        self._callWriter("setStrands", strands)

    def write(self):
        """
        Write the content and possibly convert data
        """
        if self.type == "transcript" or self.type == "sequence":
            if self.mode == "container":
                self.writer.addTranscriptList(self.container)
            else:
                # Elements were given one by one (or none was given at all).
                self.writer.write()
            return

        # SQL output
        if self.container is None:
            # No container was set: flush what was given to the writer directly.
            self.writer.write()
            return
        if self.container.format != "sql":
            self.container.storeIntoDatabase()
        tables = self.container.getTables()
        for chromosome in tables:
            tables[chromosome].rename("%s_%s" % (self.file, chromosome))

    def addSequenceFile(self, fileName):
        """
        Give the multi-fasta file of the sequences to the writer
        @param fileName: name of the multi-fasta file
        @type  fileName: string
        """
        self.writer.addSequenceFile(fileName)


# --- Remainder of the original lines, kept verbatim: patch metadata and licence header of the next file ---
# \ No newline at end of file
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/UcscWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/UcscWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,73 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under
French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.BedWriter import BedWriter


class UcscWriter(BedWriter):
    """
    Writer of transcript lists in the UCSC flavour of the BED format
    (which differs from plain BED only in minor ways)
    """

    def __init__(self, fileName, verbosity=0):
        """
        Constructor: everything is delegated to the BED writer
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(UcscWriter, self).__init__(fileName, verbosity)

    @staticmethod
    def getFileFormats():
        """
        Get the format names handled by this writer
        """
        formats = ["ucsc"]
        return formats

    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        extension = "bed"
        return extension

    def printTranscript(self, transcript):
        """
        Export the given transcript, as printed by its C{printUcsc} method
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript}
        @return:           a string
        """
        line = transcript.printUcsc()
        return line


# --- Remainder of the original line, kept verbatim: patch metadata and licence header of the next file ---
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/WigWriter.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/WigWriter.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,139 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class WigWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with WIGGLE format
    The coverage is accumulated in memory; the file is produced by L{close}
    (also called by the destructor).
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file (not kept open by this class)
    @type handle:   file handle
    @ivar data:     number of exons covering each position, by strand (-1, 0 or 1), then by chromosome
    @type data:     dict of int to dict of string to dict of int to int
    @ivar title:    title of the track
    @type title:    string
    @ivar strands:  whether each strand is considered separately
    @type strands:  boolean
    """

    def __init__(self, fileName, verbosity=0):
        """
        Constructor
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.fileName = fileName
        self.verbosity = verbosity
        self.data = {-1: {}, 0: {}, 1: {}}
        self.title = "Reads"
        self.strands = False
        self.handle = None
        # The file has to be (re)written; true from the start, since a file holding
        # the track line is produced even when no transcript is added.
        self._pending = True

    def __del__(self):
        """
        Destructor
        Print the file, if this has not been done by "close"
        """
        self.close()

    def close(self):
        """
        Actually print the file
        The output no longer depends on when (or whether) the destructor runs. The
        file is written again only if something changed since the last call.
        """
        # getattr: the destructor may run on an object whose constructor did not complete.
        if not getattr(self, "_pending", False):
            return
        strand2string = {-1: "-", 1: "+", 0: ""}
        with open(self.fileName, "w") as handle:
            handle.write("track type=wiggle_0 name=\"%s\"\n" % (self.title))
            for strand in self.data:
                for chromosome in sorted(self.data[strand]):
                    handle.write("variableStep chrom=%s%s\n" % (chromosome, strand2string[strand]))
                    coverage = self.data[strand][chromosome]
                    for pos in sorted(coverage):
                        handle.write("%d\t%d\n" % (pos, coverage[pos]))
        self._pending = False

    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["wig", "wiggle"]

    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "wig"

    def setTitle(self, title):
        """
        Set the title of the track (None leaves the title unchanged)
        @param title: the title of the track
        @type  title: string
        """
        if title is not None:
            self.title = title
            self._pending = True

    def setStrands(self, strands):
        """
        Consider each strand separately
        @param strands: whether each strand should be considered separately
        @type  strands: boolean
        """
        self.strands = strands

    def copyProperties(self, parser):
        """
        Copy the properties collected by a parser, to produce a similar output
        (only the title is copied)
        @param parser: a parser
        @type  parser: an object with a C{title} attribute
        """
        self.setTitle(parser.title)

    def addTranscript(self, transcript):
        """
        Add the exons of the given transcript to the coverage
        Nothing is written here: the file is produced by L{close}.
        @param transcript: transcript to be counted
        @type  transcript: class L{Transcript}
        """
        chromosome = transcript.getChromosome()
        direction = transcript.getDirection()
        if not self.strands:
            direction = 0
        coverage = self.data[direction].setdefault(chromosome, {})
        for exon in transcript.getExons():
            for pos in range(exon.getStart(), exon.getEnd()+1):
                coverage[pos] = coverage.get(pos, 0) + 1
        self._pending = True


# --- Remainder of the original lines, kept verbatim: patch metadata and licence header of the next file ---
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/WriterChooser.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/core/writer/WriterChooser.py Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,127 @@
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free
software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from commons.core.writer.TranscriptListWriter import TranscriptListWriter
from commons.core.writer.SequenceListWriter import SequenceListWriter
from commons.core.writer.BedWriter import BedWriter
from commons.core.writer.CsvWriter import CsvWriter
from commons.core.writer.EmblWriter import EmblWriter
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from commons.core.writer.GbWriter import GbWriter
from commons.core.writer.Gff2Writer import Gff2Writer
from commons.core.writer.SamWriter import SamWriter
from commons.core.writer.UcscWriter import UcscWriter
from commons.core.writer.WigWriter import WigWriter
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.GtfWriter import GtfWriter
from commons.core.writer.MapWriter import MapWriter


class WriterChooser(object):
    """
    A class that finds the correct writer
    @ivar type:        transcript / sequence writer
    @type type:        string
    @ivar format:      the format of the writer
    @type format:      string
    @ivar writerClass: the class of the writer
    @type writerClass: class
    @ivar extension:   default extension of the file
    @type extension:   string
    @ivar verbosity:   verbosity
    @type verbosity:   int
    """

    def __init__(self, verbosity=0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.type = None
        self.format = None
        self.writerClass = None
        self.extension = None
        self.verbosity = verbosity

    def findFormat(self, format, type=None):
        """
        Find the correct writer
        The writer class, its type and its default extension are stored in the
        object; the program exits if no writer handles the format.
        @param format: the format
        @type  format: string
        @param type:   "transcript" or "sequence" writer (None is all)
        @type  type:   string
        """
        classes = {}
        if type == "transcript":
            classes = {TranscriptListWriter: "transcript"}
        elif type == "sequence":
            classes = {SequenceListWriter: "sequence"}
        elif type is None:
            classes = {TranscriptListWriter: "transcript", SequenceListWriter: "sequence"}
        else:
            sys.exit("Do not understand format type '%s'" % (type))

        # __subclasses__() only gives the direct sub-classes, so a writer deriving
        # from another writer (e.g. UcscWriter, which derives from BedWriter) was
        # never examined. Walk the hierarchy level by level: direct sub-classes are
        # still tried first, so formats that were already found resolve as before.
        level = []
        for classType in classes:
            for writerClass in classType.__subclasses__():
                level.append((writerClass, classes[classType]))
        while level:
            nextLevel = []
            for writerClass, writerType in level:
                if format in writerClass.getFileFormats():
                    self.writerClass = writerClass
                    self.extension = writerClass.getExtension()
                    self.type = writerType
                    return
                for subClass in writerClass.__subclasses__():
                    nextLevel.append((subClass, writerType))
            level = nextLevel
        sys.exit("Cannot get writer for format '%s'" % (format))

    def getWriter(self, fileName):
        """
        Get the writer previously found
        @param fileName: name of the file to write
        @type  fileName: string
        @return:         the writer
        """
        return self.writerClass(fileName, self.verbosity)

    def getType(self):
        """
        Get the type of writer previously found
        @return: the type of writer
        """
        return self.type

    def getExtension(self):
        """
        Get the default extension of writer previously found
        @return: the extension
        """
        return self.extension


# --- Remainder of the original lines, kept verbatim: patch metadata of the next files ---
# diff -r d22fadc825e3 -r 2c0c0a89fad7 commons/core/writer/__init__.py
# diff -r d22fadc825e3 -r 2c0c0a89fad7 doc.pdf
# Binary file doc.pdf has changed
# diff -r d22fadc825e3 -r 2c0c0a89fad7 tool_conf.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/tool_conf.xml Thu May 02 09:56:47 2013 -0400
# @@ -0,0 +1,48 @@
+
diff -r d22fadc825e3 -r 2c0c0a89fad7 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,6 @@ + + + + $REPOSITORY_INSTALL_DIR + +