Previous changeset 35:d94018ca4ada (2013-04-30) Next changeset 37:d22fadc825e3 (2013-05-02) |
Commit message:
Uploaded |
added:
LICENSE.txt README.txt SMART/Java/Python/CleanTranscriptFile.py SMART/Java/Python/ClusterizeByTags.py SMART/Java/Python/CollapseReads.py SMART/Java/Python/CompareOverlapping.py SMART/Java/Python/CompareOverlappingSmallQuery.py SMART/Java/Python/CompareOverlappingSmallRef.py SMART/Java/Python/ComputeCoverage.py SMART/Java/Python/CountReadGCPercent.py SMART/Java/Python/FindOverlapsOptim.py SMART/Java/Python/GetDifferentialExpression.py SMART/Java/Python/GetDistribution.py SMART/Java/Python/GetFlanking.py SMART/Java/Python/SelectByTag.py SMART/Java/Python/WrappGetDistribution.py SMART/Java/Python/WrappGetReadDistribution.py SMART/Java/Python/WrappPlotCoverage.py SMART/Java/Python/__init__.py SMART/Java/Python/changeGffFeatures.sh SMART/Java/Python/changeTagName.py SMART/Java/Python/cleaning/CleanerChooser.py SMART/Java/Python/cleaning/DefaultCleaner.py SMART/Java/Python/cleaning/GffCleaner.py SMART/Java/Python/cleaning/GtfCleaner.py SMART/Java/Python/cleaning/TranscriptListCleaner.py SMART/Java/Python/cleaning/__init__.py SMART/Java/Python/clusterize.py SMART/Java/Python/clusterizeBySlidingWindows.py SMART/Java/Python/compareOverlapping.py SMART/Java/Python/convertTranscriptFile.py SMART/Java/Python/coordinatesToSequence.py SMART/Java/Python/getDifference.py SMART/Java/Python/getDistance.py SMART/Java/Python/getDistribution.py SMART/Java/Python/getExons.py SMART/Java/Python/getIntrons.py SMART/Java/Python/getLetterDistribution.py SMART/Java/Python/getReadDistribution.py SMART/Java/Python/getSizes.py SMART/Java/Python/getWigData.py SMART/Java/Python/getWigDistance.py SMART/Java/Python/getWigProfile.py SMART/Java/Python/mapperAnalyzer.py SMART/Java/Python/mappingToCoordinates.py SMART/Java/Python/mergeSlidingWindowsClusters.py SMART/Java/Python/mergeTranscriptLists.py SMART/Java/Python/misc/MultipleRPlotter.py SMART/Java/Python/misc/Progress.py SMART/Java/Python/misc/RPlotter.py SMART/Java/Python/misc/UnlimitedProgress.py SMART/Java/Python/misc/Utils.py 
SMART/Java/Python/misc/__init__.py SMART/Java/Python/modifyFasta.py SMART/Java/Python/modifyGenomicCoordinates.py SMART/Java/Python/modifySequenceList.py SMART/Java/Python/mySql/MySqlConnection.py SMART/Java/Python/mySql/MySqlExonTable.py SMART/Java/Python/mySql/MySqlQuery.py SMART/Java/Python/mySql/MySqlTable.py SMART/Java/Python/mySql/MySqlTranscriptTable.py SMART/Java/Python/mySql/__init__.py SMART/Java/Python/ncList/.NCList.py.swp SMART/Java/Python/ncList/.NCListCursor.py.swp SMART/Java/Python/ncList/Benchmark.py SMART/Java/Python/ncList/ConvertToNCList.py SMART/Java/Python/ncList/FileSorter.py SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsBin.py SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsIndex.py SMART/Java/Python/ncList/FindOverlaps_naif.py SMART/Java/Python/ncList/NCIndex.py SMART/Java/Python/ncList/NCList.py SMART/Java/Python/ncList/NCListCursor.py SMART/Java/Python/ncList/NCListFilePickle.py SMART/Java/Python/ncList/NCListHandler.py SMART/Java/Python/ncList/NCListMerger.py SMART/Java/Python/ncList/NCListParser.py SMART/Java/Python/ncList/__init__.py SMART/Java/Python/plotCoverage.py SMART/Java/Python/plotRepartition.py SMART/Java/Python/plotTranscriptList.py SMART/Java/Python/removeExonLines.sh SMART/Java/Python/restrictFromSize.py SMART/Java/Python/restrictSequenceList.py SMART/Java/Python/restrictTranscriptList.py SMART/Java/Python/structure/Bins.py SMART/Java/Python/structure/Interval.py SMART/Java/Python/structure/Mapping.py SMART/Java/Python/structure/Sequence.py SMART/Java/Python/structure/SequenceList.py SMART/Java/Python/structure/SubMapping.py SMART/Java/Python/structure/Transcript.py SMART/Java/Python/structure/TranscriptContainer.py SMART/Java/Python/structure/TranscriptList.py SMART/Java/Python/structure/TranscriptListIterator.py SMART/Java/Python/structure/TranscriptListsComparator.py 
SMART/Java/Python/structure/__init__.py SMART/Java/Python/trimSequences.py SMART/Java/__init__.py SMART/__init__.py SMART/galaxy/CleanTranscriptFile.xml SMART/galaxy/Clusterize.xml SMART/galaxy/CollapseReads.xml SMART/galaxy/CompareOverlappingSmallQuery.xml SMART/galaxy/CompareOverlappingSmallRef.xml SMART/galaxy/ConvertTranscriptFile.xml SMART/galaxy/CountReadGCPercent.xml SMART/galaxy/GetDifferentialExpression.xml SMART/galaxy/GetFlanking.xml SMART/galaxy/SelectByTag.xml SMART/galaxy/WrappGetLetterDistribution.xml SMART/galaxy/changeGffFeatures.xml SMART/galaxy/changeTagName.xml SMART/galaxy/clusterizeBySlidingWindows.xml SMART/galaxy/compareOverlapping.xml SMART/galaxy/computeCoverage.xml SMART/galaxy/coordinatesToSequence.xml SMART/galaxy/getDifference.xml SMART/galaxy/getDistance.xml SMART/galaxy/getDistribution.xml SMART/galaxy/getExons.xml SMART/galaxy/getIntrons.xml SMART/galaxy/getReadDistribution.xml SMART/galaxy/getSizes.xml SMART/galaxy/getWigData.xml SMART/galaxy/getWigDistance.xml SMART/galaxy/getWigProfile.xml SMART/galaxy/mapperAnalyzer.xml SMART/galaxy/mergeSlidingWindowsClusters.xml SMART/galaxy/mergeTranscriptLists.xml SMART/galaxy/modifyGenomicCoordinates.xml SMART/galaxy/modifySequenceList.xml SMART/galaxy/plotCoverage.xml SMART/galaxy/plotTranscriptList.xml SMART/galaxy/removeExonLines.xml SMART/galaxy/restrictFromSize.xml SMART/galaxy/restrictTranscriptList.xml SMART/galaxy/trimSequences.xml commons/__init__.py commons/core/LoggerFactory.py commons/core/__init__.py commons/core/checker/AbstractChecker.py commons/core/checker/CheckerException.py commons/core/checker/CheckerUtils.py commons/core/checker/ConfigChecker.py commons/core/checker/ConfigException.py commons/core/checker/ConfigValue.py commons/core/checker/IChecker.py commons/core/checker/OldConfigChecker.py commons/core/checker/RepetException.py commons/core/checker/__init__.py commons/core/coord/Align.py commons/core/coord/AlignUtils.py commons/core/coord/ConvCoord.py 
commons/core/coord/Map.py commons/core/coord/MapUtils.py commons/core/coord/Match.py commons/core/coord/MatchUtils.py commons/core/coord/MergedRange.py commons/core/coord/Path.py commons/core/coord/PathUtils.py commons/core/coord/Range.py commons/core/coord/Set.py commons/core/coord/SetUtils.py commons/core/coord/SlidingWindow.py commons/core/coord/__init__.py commons/core/coord/align2set.py commons/core/parsing/.BamParser.py.swp commons/core/parsing/AxtParser.py commons/core/parsing/BamParser.py commons/core/parsing/BedParser.py commons/core/parsing/BlastParser.py commons/core/parsing/BlatFileParser.py commons/core/parsing/BlatParser.py commons/core/parsing/BlatToGff.py commons/core/parsing/BlatToGffForBesPaired.py commons/core/parsing/BowtieParser.py commons/core/parsing/CoordsParser.py commons/core/parsing/CrossSsrAndBesMappedByBlatToGff.py commons/core/parsing/ElandParser.py commons/core/parsing/ExoParser.py commons/core/parsing/FastaParser.py commons/core/parsing/FastqParser.py commons/core/parsing/FindRep.py commons/core/parsing/GbParser.py commons/core/parsing/GffParser.py commons/core/parsing/GtfParser.py commons/core/parsing/MapParser.py commons/core/parsing/MapperParser.py commons/core/parsing/MaqParser.py commons/core/parsing/MrepsToSet.py commons/core/parsing/Multifasta2SNPFile.py commons/core/parsing/MummerParser.py commons/core/parsing/NCListParser.py commons/core/parsing/NucmerParser.py commons/core/parsing/PalsToAlign.py commons/core/parsing/ParserChooser.py commons/core/parsing/PathNum2Id.py commons/core/parsing/PilerTAToGrouperMap.py commons/core/parsing/PklParser.py commons/core/parsing/PslParser.py commons/core/parsing/README_MultiFasta2SNPFile commons/core/parsing/RmapParser.py commons/core/parsing/SamParser.py commons/core/parsing/SeqmapParser.py commons/core/parsing/SequenceListParser.py commons/core/parsing/ShrimpParser.py commons/core/parsing/Soap2Parser.py commons/core/parsing/SoapParser.py commons/core/parsing/SsrParser.py 
commons/core/parsing/TranscriptListParser.py commons/core/parsing/VarscanFile.py commons/core/parsing/VarscanFileForGnpSNP.py commons/core/parsing/VarscanHit.py commons/core/parsing/VarscanHitForGnpSNP.py commons/core/parsing/VarscanHit_WithTag.py commons/core/parsing/VarscanHit_v2_2_8.py commons/core/parsing/VarscanHit_v2_2_8_WithTag.py commons/core/parsing/VarscanToVCF.py commons/core/parsing/WigParser.py commons/core/parsing/__init__.py commons/core/parsing/multifastaParserLauncher.py commons/core/seq/AlignedBioseqDB.py commons/core/seq/Bioseq.py commons/core/seq/BioseqDB.py commons/core/seq/BioseqUtils.py commons/core/seq/ClusterConsensusCollection.py commons/core/seq/FastaUtils.py commons/core/seq/__init__.py commons/core/utils/FileUtils.py commons/core/utils/PipelineStepFTests.py commons/core/utils/RepetConfigParser.py commons/core/utils/RepetOptionParser.py commons/core/utils/__init__.py commons/core/writer/BedWriter.py commons/core/writer/CsvWriter.py commons/core/writer/EmblWriter.py commons/core/writer/FastaWriter.py commons/core/writer/FastqWriter.py commons/core/writer/GbWriter.py commons/core/writer/Gff2Writer.py commons/core/writer/Gff3Writer.py commons/core/writer/GtfWriter.py commons/core/writer/MapWriter.py commons/core/writer/MySqlTranscriptWriter.py commons/core/writer/SamWriter.py commons/core/writer/SequenceListWriter.py commons/core/writer/TranscriptListWriter.py commons/core/writer/TranscriptWriter.py commons/core/writer/UcscWriter.py commons/core/writer/WigWriter.py commons/core/writer/WriterChooser.py commons/core/writer/__init__.py doc.pdf tool_conf.xml tool_dependencies.xml |
b |
diff -r d94018ca4ada -r 44d5973c188c LICENSE.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE.txt Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,506 @@\n+\n+CeCILL FREE SOFTWARE LICENSE AGREEMENT\n+\n+\n+ Notice\n+\n+This Agreement is a Free Software license agreement that is the result\n+of discussions between its authors in order to ensure compliance with\n+the two main principles guiding its drafting:\n+\n+ * firstly, compliance with the principles governing the distribution\n+ of Free Software: access to source code, broad rights granted to\n+ users,\n+ * secondly, the election of a governing law, French law, with which\n+ it is conformant, both as regards the law of torts and\n+ intellectual property law, and the protection that it offers to\n+ both authors and holders of the economic rights over software.\n+\n+The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre])\n+license are:\n+\n+Commissariat \xe0 l\'Energie Atomique - CEA, a public scientific, technical\n+and industrial research establishment, having its principal place of\n+business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France.\n+\n+Centre National de la Recherche Scientifique - CNRS, a public scientific\n+and technological establishment, having its principal place of business\n+at 3 rue Michel-Ange, 75794 Paris cedex 16, France.\n+\n+Institut National de Recherche en Informatique et en Automatique -\n+INRIA, a public scientific and technological establishment, having its\n+principal place of business at Domaine de Voluceau, Rocquencourt, BP\n+105, 78153 Le Chesnay cedex, France.\n+\n+\n+ Preamble\n+\n+The purpose of this Free Software license agreement is to grant users\n+the right to modify and redistribute the software governed by this\n+license within the framework of an open source distribution model.\n+\n+The exercising of these rights is conditional upon certain obligations\n+for users so as to preserve this status for all subsequent redistributions.\n+\n+In consideration of access to the source code and the rights to copy,\n+modify and redistribute granted by the license, users are provided 
only\n+with a limited warranty and the software\'s author, the holder of the\n+economic rights, and the successive licensors only have limited liability.\n+\n+In this respect, the risks associated with loading, using, modifying\n+and/or developing or reproducing the software by the user are brought to\n+the user\'s attention, given its Free Software status, which may make it\n+complicated to use, with the result that its use is reserved for\n+developers and experienced professionals having in-depth computer\n+knowledge. Users are therefore encouraged to load and test the\n+suitability of the software as regards their requirements in conditions\n+enabling the security of their systems and/or data to be ensured and,\n+more generally, to use and operate it in the same conditions of\n+security. This Agreement may be freely reproduced and published,\n+provided it is not altered, and that no provisions are either added or\n+removed herefrom.\n+\n+This Agreement may apply to any or all software for which the holder of\n+the economic rights decides to submit the use thereof to its provisions.\n+\n+\n+ Article 1 - DEFINITIONS\n+\n+For the purpose of this Agreement, when the following expressions\n+commence with a capital letter, they shall have the following meaning:\n+\n+Agreement: means this license agreement, and its possible subsequent\n+versions and annexes.\n+\n+Software: means the software in its Object Code and/or Source Code form\n+and, where applicable, its documentation, "as is" when the Licensee\n+accepts the Agreement.\n+\n+Initial Software: means the Software in its Source Code and possibly its\n+Object Code form and, where applicable, its documentation, "as is" when\n+it is first distributed under the terms and conditions of the Agreement.\n+\n+Modified Software: means the Software modified by at least one\n+Contribution.\n+\n+Source Code: means all the Software\'s instructions and program lines to\n+which access is required so as to modify the 
Software.\n+\n+Object Code: means the binary files originating from the co'..b"a case-by-case basis between the relevant Licensor and the\n+Licensee pursuant to a memorandum of understanding. The Licensor\n+disclaims any and all liability as regards the Licensee's use of the\n+name of the Software. No warranty is given as regards the existence of\n+prior rights over the name of the Software or as regards the existence\n+of a trademark.\n+\n+\n+ Article 10 - TERMINATION\n+\n+10.1 In the event of a breach by the Licensee of its obligations\n+hereunder, the Licensor may automatically terminate this Agreement\n+thirty (30) days after notice has been sent to the Licensee and has\n+remained ineffective.\n+\n+10.2 A Licensee whose Agreement is terminated shall no longer be\n+authorized to use, modify or distribute the Software. However, any\n+licenses that it may have granted prior to termination of the Agreement\n+shall remain valid subject to their having been granted in compliance\n+with the terms and conditions hereof.\n+\n+\n+ Article 11 - MISCELLANEOUS\n+\n+\n+ 11.1 EXCUSABLE EVENTS\n+\n+Neither Party shall be liable for any or all delay, or failure to\n+perform the Agreement, that may be attributable to an event of force\n+majeure, an act of God or an outside cause, such as defective\n+functioning or interruptions of the electricity or telecommunications\n+networks, network paralysis following a virus attack, intervention by\n+government authorities, natural disasters, water damage, earthquakes,\n+fire, explosions, strikes and labor unrest, war, etc.\n+\n+11.2 Any failure by either Party, on one or more occasions, to invoke\n+one or more of the provisions hereof, shall under no circumstances be\n+interpreted as being a waiver by the interested Party of its right to\n+invoke said provision(s) subsequently.\n+\n+11.3 The Agreement cancels and replaces any or all previous agreements,\n+whether written or oral, between the Parties and having the same\n+purpose, and 
constitutes the entirety of the agreement between said\n+Parties concerning said purpose. No supplement or modification to the\n+terms and conditions hereof shall be effective as between the Parties\n+unless it is made in writing and signed by their duly authorized\n+representatives.\n+\n+11.4 In the event that one or more of the provisions hereof were to\n+conflict with a current or future applicable act or legislative text,\n+said act or legislative text shall prevail, and the Parties shall make\n+the necessary amendments so as to comply with said act or legislative\n+text. All other provisions shall remain effective. Similarly, invalidity\n+of a provision of the Agreement, for any reason whatsoever, shall not\n+cause the Agreement as a whole to be invalid.\n+\n+\n+ 11.5 LANGUAGE\n+\n+The Agreement is drafted in both French and English and both versions\n+are deemed authentic.\n+\n+\n+ Article 12 - NEW VERSIONS OF THE AGREEMENT\n+\n+12.1 Any person is authorized to duplicate and distribute copies of this\n+Agreement.\n+\n+12.2 So as to ensure coherence, the wording of this Agreement is\n+protected and may only be modified by the authors of the License, who\n+reserve the right to periodically publish updates or new versions of the\n+Agreement, each with a separate number. These subsequent versions may\n+address new issues encountered by Free Software.\n+\n+12.3 Any Software distributed under a given version of the Agreement may\n+only be subsequently distributed under the same version of the Agreement\n+or a subsequent version, subject to the provisions of Article 5.3.4.\n+\n+\n+ Article 13 - GOVERNING LAW AND JURISDICTION\n+\n+13.1 The Agreement is governed by French law. 
The Parties agree to\n+endeavor to seek an amicable solution to any disagreements or disputes\n+that may arise during the performance of the Agreement.\n+\n+13.2 Failing an amicable solution within two (2) months as from their\n+occurrence, and unless emergency proceedings are necessary, the\n+disagreements or disputes shall be referred to the Paris Courts having\n+jurisdiction, by the more diligent Party.\n+\n+\n+Version 2.0 dated 2006-09-05.\n" |
b |
diff -r d94018ca4ada -r 44d5973c188c README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,51 @@ +---------- +| NAME | +---------- +S-MART + + +Description +----------- +Several tools are now available for mapping high-throughput sequencing data from a genome, but few can extract biological knowledge from the mapped reads. We have developed a toolbox, S-MART, which handles mapped RNA-Seq and ChIP-Seq data. + +S-MART is an intuitive and lightweight tool, performing several tasks that are usually required during the analysis of mapped RNA-Seq and ChIP-Seq reads, including data selection and data visualization. + +S-MART does not require a computer science background and thus can be used by all biologists through a graphical interface. S-MART can run on any personal computer, yielding results within an hour for most queries. + + +Instructions +------------ +Installation instructions and the user guide are available in the file "doc.pdf". + + +Copyright +--------- +Copyright INRA-URGI 2009-2010 + + +Authors +------- +Matthias Zytnicki + + +Contact +------- +urgi-contact@versailles.inra.fr + + +License +------- +This library is distributed under the terms of the CeCILL license +(http://www.cecill.info/index.en.html). +See the LICENSE.txt file. + + +Acknowledgements +---------------- +This product requires the following software: + * R, under the GNU General Public License + * MySQL, under the GNU General Public License + * Python, under the Python License, compatible with the GNU General Public License + * MySQL for Python, under the GNU General Public License + * Java, under the GNU General Public License + |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/CleanTranscriptFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CleanTranscriptFile.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,74 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from optparse import OptionParser
from SMART.Java.Python.cleaning.CleanerChooser import CleanerChooser


class CleanTranscriptFile(object):
    """
    Clean a transcript file so that it is usable by S-MART.

    The actual work is delegated to a cleaner object obtained from
    CleanerChooser for the requested format.  setInputFile() must be called
    first: it is the call that selects the cleaner used by every other method.
    """

    def __init__(self, verbosity):
        """
        @param verbosity: trace level, forwarded to CleanerChooser
        """
        self.verbosity = verbosity
        self.chooser = CleanerChooser(self.verbosity)
        # Set by setInputFile(); kept as None until then so that misuse can
        # be reported explicitly instead of failing on a missing attribute.
        self.cleaner = None

    def _getCleaner(self):
        """
        Return the selected cleaner.

        @raise AttributeError: if setInputFile() has not been called yet
            (same exception type as the unguarded attribute access raised
            before, but with an actionable message)
        """
        if self.cleaner is None:
            raise AttributeError("No cleaner selected: call setInputFile() before any other operation")
        return self.cleaner

    def setInputFile(self, fileName, format):
        """
        Select the cleaner matching the format and give it the input file.

        @param fileName: name of the file to clean
        @param format:   format name, passed to CleanerChooser.findFormat()
        """
        self.chooser.findFormat(format)
        self.cleaner = self.chooser.getCleaner()
        self.cleaner.setInputFileName(fileName)

    def setOutputFile(self, fileName):
        """
        @param fileName: name of the output file, forwarded to the cleaner
        """
        self._getCleaner().setOutputFileName(fileName)

    def setAcceptedTypes(self, types):
        """
        Restrict the types kept by the cleaner.

        @param types: list of type names, or None to leave the cleaner's
                      own setting untouched
        """
        # None means "no restriction requested": nothing is forwarded.
        if types is not None:
            self._getCleaner().setAcceptedTypes(types)

    def run(self):
        """
        Run the cleaner on the configured input and output files.
        """
        self._getCleaner().clean()


if __name__ == "__main__":

    description = "Clean Transcript File v1.0.1: Clean a transcript file so that it is useable for S-MART. [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-t", "--types", dest="acceptedTypes", action="store", default=None, type="string", help="name of the types you want to keep in GFF/GTF (list separated by commas) [format: string] [default: None]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # The options documented as compulsory have no default: report their
    # absence through optparse (usage message + exit status 2) rather than
    # handing None to the chooser and the cleaner.
    missing = [flag for flag, value in (("-i/--input", options.inputFileName), ("-f/--format", options.format)) if value is None]
    if missing:
        parser.error("missing compulsory option(s): %s" % ", ".join(missing))

    # The -t value is a comma-separated list; None (option not given) is
    # passed through unchanged.
    acceptedTypes = None
    if options.acceptedTypes is not None:
        acceptedTypes = options.acceptedTypes.split(",")

    ctf = CleanTranscriptFile(options.verbosity)
    ctf.setInputFile(options.inputFileName, options.format)
    ctf.setOutputFile(options.outputFileName)
    ctf.setAcceptedTypes(acceptedTypes)
    ctf.run()
|
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ClusterizeByTags.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ClusterizeByTags.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,157 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2011\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import random\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.TranscriptWriter import TranscriptWriter\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection\n+from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter\n+\n+\n+OPERATIONS = ("diff", "div")\n+BOOLTOSTRANDS = {True: [0], False: [-1, 1]}\n+\n+class ClusterizeByTags(object):\n+\n+ def __init__(self, verbosity):\n+ self.verbosity = verbosity\n+ self.connection = MySqlConnection(self.verbosity-1)\n+ self.defautValue = None\n+ self.maxDistance = None\n+ self.oneStrand = False\n+\n+ def setInputFile(self, fileName, format):\n+ chooser = ParserChooser(self.verbosity)\n+ chooser.findFormat(format)\n+ parser = chooser.getParser(fileName)\n+ writer = MySqlTranscriptWriter(self.connection, None, self.verbosity)\n+ writer.addTranscriptList(parser)\n+ writer.write()\n+ self.transcriptTables = writer.getTables()\n+\n+ def setOutputFile(self, fileName):\n+ self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)\n+\n+ def setTag(self, tagName, defaultValue):\n+ self.tagName = tagName\n+ self.defaultValue = defaultValue\n+\n+ def setThreshold(self, threshold):\n+ self.threshold = threshold\n+\n+ def setOperation(self, operation):\n+ self.operation = operation\n+ if self.operation not in 
OPERATIONS:\n+ raise Exception("Operation \'%s\' unsupported: choose among %s" % (self.operation, ", ".join(OPERATIONS)))\n+\n+ def setMaxDistance(self, distance):\n+ self.maxDistance = distance\n+\n+ def setOneStrand(self, oneStrand):\n+ self.oneStrand = oneStrand\n+\n+ def run(self):\n+ for chromosome in sorted(self.transcriptTables.keys()):\n+ progress = Progress(self.transcriptTables[chromosome].getNbElements(), "Analyzing %s" % (chromosome), self.verbosity)\n+ for strand in BOOLTOSTRANDS[self.oneStrand]:\n+ previousValue = None\n+ previousTrend = None\n+ previousTranscript = None\n+ sumValue = 0\n+ command = "SELECT * FROM %s" % (self.tran'..b' trend = value / previousValue\n+ if previousTranscript == None:\n+ sumValue = value\n+ elif (previousTrend == None or abs(trend - previousTrend) <= self.threshold) and (self.maxDistance == None or previousTranscript.getDistance(transcript) <= self.maxDistance) and (previousTranscript.getDirection() == transcript.getDirection() or not self.oneStrand):\n+ if previousTranscript.getDirection() != transcript.getDirection():\n+ transcript.reverse()\n+ previousTranscript.merge(transcript)\n+ transcript = previousTranscript\n+ sumValue += value\n+ previousTrend = trend\n+ else:\n+ previousTranscript.setTagValue(self.tagName, sumValue)\n+ self.writer.addTranscript(previousTranscript)\n+ sumValue = value\n+ previousTrend = None\n+ previousValue = value\n+ previousTranscript = transcript\n+ progress.inc()\n+ if previousTranscript != None:\n+ previousTranscript.setTagValue(self.tagName, sumValue)\n+ self.writer.addTranscript(previousTranscript)\n+ progress.done()\n+ self.writer.close()\n+\n+\n+if __name__ == "__main__":\n+ \n+ description = "Clusterize By Tags v1.0.1: Clusterize a set of element using their tag values. 
[Category: Merge]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")\n+ parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+ parser.add_option("-t", "--tag", dest="tagName", action="store", type="string", help="name of the tag [format: string] [compulsory]")\n+ parser.add_option("-e", "--default", dest="defaultValue", action="store", default=None, type="int", help="default value for the tag [format: string]")\n+ parser.add_option("-r", "--threshold", dest="threshold", action="store", type="int", help="threshold between two consecutive tags [format: int] [compulsory]")\n+ parser.add_option("-p", "--operation", dest="operation", action="store", type="string", help="operation to apply between 2 different clusters to compare them [format: choice (diff, div)] [compulsory]")\n+ parser.add_option("-d", "--distance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance for 2 clusters to be merged [format: int] [default: None]")\n+ parser.add_option("-1", "--oneStrand", dest="oneStrand", action="store_true", default=False, help="also cluster the elements which are on different strands [format: bool] [default: False]")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")\n+ (options, args) = parser.parse_args()\n+\n+ cbt = ClusterizeByTags(options.verbosity)\n+ cbt.setInputFile(options.inputFileName, options.format)\n+ cbt.setOutputFile(options.outputFileName)\n+ cbt.setTag(option.tagName, option.defaultValue)\n+ 
cbt.setThreshold(option.threshold)\n+ cbt.setOperation(option.operation)\n+ cbt.setMaxDistance(operation.maxDistance)\n+ cbt.setOneStrand(operation.oneStrand)\n+ cbt.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/CollapseReads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CollapseReads.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,174 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import os\n+from optparse import OptionParser, OptionGroup\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle\n+from SMART.Java.Python.ncList.FileSorter import FileSorter\n+from SMART.Java.Python.misc.Progress import Progress\n+\n+\n+class CollapseReads(object):\n+ """\n+ Merge two reads if they have exactly the same genomic coordinates\n+ """\n+\n+ def __init__(self, verbosity = 0):\n+ self.verbosity = verbosity\n+ self.inputReader = None\n+ self.outputWriter = None\n+ self.strands = True\n+ self.nbRead = 0\n+ self.nbWritten = 0\n+ self.nbMerges = 0\n+ self.splittedFileNames = {}\n+\n+ def __del__(self):\n+ for fileName in self.splittedFileNames.values():\n+ os.remove(fileName)\n+ \n+ def close(self):\n+ self.outputWriter.close()\n+ \n+ def setInputFile(self, fileName, format):\n+ parserChooser = ParserChooser(self.verbosity)\n+ parserChooser.findFormat(format)\n+ self.parser = parserChooser.getParser(fileName)\n+ self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])\n+\n+ def setOutputFile(self, fileName):\n+ self.outputWriter = Gff3Writer(fileName, self.verbosity)\n+\n+ def getNbElements(self):\n+ return self.parser.getNbTranscripts()\n+\n+ def _sortFile(self):\n+ fs = FileSorter(self.parser, self.verbosity-4)\n+ fs.perChromosome(True)\n+ fs.setOutputFileName(self.sortedFileName)\n+ fs.sort()\n+ 
self.splittedFileNames = fs.getOutputFileNames()\n+ self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()\n+ self.nbRead = fs.getNbElements()\n+ \n+ def _iterate(self, chromosome):\n+ progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)\n+ transcripts = []\n+ parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)\n+ for newTranscript in parser.getIterator():\n+ newTranscripts = []\n+ for oldTranscript in transcripts:\n+ if self._checkOverlap(newTranscript, oldTranscript):\n+ '..b'pt2):\n+ self.nbMerges += 1\n+ transcript2.setDirection(transcript1.getDirection())\n+ transcript1.merge(transcript2)\n+\n+ def _write(self, transcript):\n+ self.nbWritten += 1\n+ self.outputWriter.addTranscript(transcript)\n+\n+ def _checkOverlap(self, transcript1, transcript2):\n+ if transcript1.getStart() != transcript2.getStart() or transcript1.getEnd() != transcript2.getEnd():\n+ return False\n+ return (not self.strands or transcript1.getDirection() == transcript2.getDirection())\n+\n+ def _checkPassed(self, transcript1, transcript2):\n+ return (transcript2.getStart() < transcript1.getStart())\n+\n+ def collapseChromosome(self, chromosome):\n+ progress = Progress(table.getNbElements(), "Analysing chromosome %s" % (chromosome), self.verbosity)\n+ command = "SELECT * FROM %s ORDER BY start ASC, end DESC" % (table.name)\n+ transcriptStart = None\n+ transcriptEnd = None\n+ transcriptDirection = None\n+ currentTranscript = None\n+ if self.strands:\n+ command += ", direction"\n+ for index, transcript in table.selectTranscripts(command, True):\n+ self.nbRead += 1\n+ if not self.strands:\n+ transcript.setDirection("+")\n+ if transcriptStart != transcript.getStart() or transcriptEnd != transcript.getEnd() or transcriptDirection != transcript.getDirection():\n+ self.writeTranscript(currentTranscript)\n+ transcriptStart = transcript.getStart()\n+ transcriptEnd = transcript.getEnd()\n+ 
transcriptDirection = transcript.getDirection()\n+ currentTranscript = transcript\n+ else:\n+ currentTranscript.setTagValue("nbElements", (currentTranscript.getTagValue("nbElements") + 1) if "nbElements" in currentTranscript.getTagNames() else 1)\n+ progress.inc()\n+ self.writeTranscript(currentTranscript)\n+ progress.done()\n+\n+ def collapse(self):\n+ self._sortFile()\n+ for chromosome in sorted(self.nbElementsPerChromosome.keys()):\n+ self._iterate(chromosome)\n+ self.outputWriter.close()\n+ if self.verbosity > 1:\n+ print "# reads read: %d" % (self.nbRead)\n+ print "# reads written: %d (%.2f%%)" % (self.nbWritten, float(self.nbWritten) / self.nbRead * 100)\n+ print "# reads merges: %d" % (self.nbMerges)\n+\n+if __name__ == "__main__":\n+ \n+ # parse command line\n+ description = "Collapse Reads v1.0.3: Merge two reads if they have exactly the same genomic coordinates. [Category: Merge]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in mapping format given by -f]")\n+ parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: mapping file format]")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")\n+ parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="merge elements on 2 different strands [format: bool] [default: false]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")\n+ (options, args) = parser.parse_args()\n+\n+ collapser = CollapseReads(options.verbosity)\n+ collapser.setInputFile(options.inputFileName, options.format)\n+ collapser.setOutputFile(options.outputFileName)\n+ collapser.strands = not 
options.strands\n+ collapser.collapse()\n+ collapser.close()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/CompareOverlapping.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CompareOverlapping.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,491 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import os, struct, time, random\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.ncList.NCList import NCList\n+from SMART.Java.Python.ncList.NCListCursor import NCListCursor\n+from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle\n+from SMART.Java.Python.ncList.NCListHandler import NCListHandler\n+from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+from SMART.Java.Python.misc import Utils\n+try:\n+\timport cPickle as pickle\n+except:\n+\timport pickle\n+\n+REFERENCE = 0\n+QUERY = 1\n+TYPES = (REFERENCE, QUERY)\n+TYPETOSTRING = {0: "reference", 1: "query"}\n+\n+class CompareOverlapping(object):\n+\n+\tdef __init__(self, verbosity = 1):\n+\t\tself._outputFileName\t\t = "outputOverlaps.gff3"\n+\t\tself._iWriter\t\t\t\t = None\n+\t\tself._nbOverlappingQueries\t = 0\n+\t\tself._nbOverlaps\t\t\t = 0\n+\t\tself._nbLines\t\t\t\t = {REFERENCE: 0, QUERY: 0}\n+\t\tself._verbosity\t\t\t\t = verbosity\n+\t\tself._ncLists\t\t\t\t = {}\n+\t\tself._cursors\t\t\t\t = {}\n+\t\tself._splittedFileNames\t\t = {}\n+\t\tself._nbElements\t\t\t = {}\n+\t\tself._nbElementsPerChromosome = {}\n+\t\tself._inputFileNames\t\t = 
{REFERENCE: None, QUERY: None}\n+\t\tself._inputFileFormats\t\t = {REFERENCE: None, QUERY: None}\n+\t\tself._starts\t\t\t\t = {REFERENCE: None, QUERY: None}\n+\t\tself._ends\t\t\t\t\t = {REFERENCE: None, QUERY: None}\n+\t\tself._fivePrimes\t\t\t = {REFERENCE: None, QUERY: None}\n+\t\tself._threePrimes\t\t\t = {REFERENCE: None, QUERY: None}\n+\t\tself._ncListHandlers\t\t = {REFERENCE: None, QUERY: None}\n+\t\tself._convertedFileNames\t = {REFERENCE: False, QUERY: False}\n+\t\tself._sorted = False\n+\t\tself._index = False\n+\t\tself._introns\t\t\t\t = False\n+\t\tself._antisense\t\t\t\t = False\n+\t\tself._colinear\t\t\t\t = False\n+\t\tself._invert\t\t\t\t = False\n+\t\tself._distance\t\t\t\t = 0\n+\t\tself._minOverlap\t\t\t = 1\n+\t\tself._pcOverlap\t\t\t\t = None\n+\t\tself._included\t\t\t\t = False\n+\t\tself._including\t\t\t\t = False\n+\t\tself._outputNotOverlapping\t = False\n+\t\tself._tmpRefFileName\t\t = None\n+\t\tself._currentQueryTranscript = None\n+\t\tself._currentOrQueryTranscript = None\n+\t'..b'in file 1 (do not use it with -S) [format: int]")\n+\tparser.add_option("-u", "--end2",\t\t\t dest="end2",\t\t action="store",\t default=None, type="int",\thelp="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")\n+\tparser.add_option("-t", "--intron",\t\t dest="introns",\t\t action="store_true", default=False,\t\t\t\thelp="also report introns [format: bool] [default: false]")\n+\tparser.add_option("-E", "--5primeExtension1", dest="fivePrime1",\t action="store",\t default=None, type="int",\thelp="extension towards 5\' in file 1 [format: int]")\n+\tparser.add_option("-e", "--5primeExtension2", dest="fivePrime2",\t action="store",\t default=None, type="int",\thelp="extension towards 5\' in file 2 [format: int]")\n+\tparser.add_option("-N", "--3primeExtension1", dest="threePrime1",\t action="store",\t default=None, type="int",\thelp="extension towards 3\' in file 1 [format: 
int]")\n+\tparser.add_option("-n", "--3primeExtension2", dest="threePrime2",\t action="store",\t default=None, type="int",\thelp="extension towards 3\' in file 2 [format: int]")\n+\tparser.add_option("-c", "--colinear",\t\t dest="colinear",\t\t action="store_true", default=False,\t\t\t\thelp="colinear only [format: bool] [default: false]")\n+\tparser.add_option("-a", "--antisense",\t\t dest="antisense",\t\t action="store_true", default=False,\t\t\t\thelp="antisense only [format: bool] [default: false]")\n+\tparser.add_option("-d", "--distance",\t\t dest="distance",\t action="store",\t default=0,\t type="int",\thelp="accept some distance between query and reference [format: int]")\n+\tparser.add_option("-k", "--included",\t\t dest="included",\t action="store_true", default=False,\t\t\t\thelp="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")\n+\tparser.add_option("-K", "--including",\t\t dest="including",\t action="store_true", default=False,\t\t\t\thelp="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")\n+\tparser.add_option("-m", "--minOverlap",\t\t dest="minOverlap",\t action="store",\t default=1,\t type="int",\thelp="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")\n+\tparser.add_option("-p", "--pcOverlap",\t\t dest="pcOverlap",\t action="store",\t default=None, type="int",\thelp="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")\n+\tparser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False,\t\t\t\thelp="also output not overlapping data [format: bool] [default: false]")\n+\tparser.add_option("-x", "--exclude",\t\t dest="exclude",\t\t action="store_true", default=False,\t\t\t\thelp="invert the match [format: bool] [default: false]")\n+\tparser.add_option("-v", "--verbosity",\t\t dest="verbosity",\t\t action="store",\t default=1,\t 
type="int",\thelp="trace level [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\tco = CompareOverlapping(options.verbosity)\n+\tco.setInput(options.inputFileName1, options.format1, QUERY)\n+\tco.setInput(options.inputFileName2, options.format2, REFERENCE)\n+\tco.setOutput(options.output)\n+\tco.setSorted(options.sorted)\n+\tco.setIndex(options.index)\n+\tco.restrictToStart(options.start1, QUERY)\n+\tco.restrictToStart(options.start2, REFERENCE)\n+\tco.restrictToEnd(options.end1, QUERY)\n+\tco.restrictToEnd(options.end2, REFERENCE)\n+\tco.extendFivePrime(options.fivePrime1, QUERY)\n+\tco.extendFivePrime(options.fivePrime2, REFERENCE)\n+\tco.extendThreePrime(options.threePrime1, QUERY)\n+\tco.extendThreePrime(options.threePrime2, REFERENCE)\n+\tco.acceptIntrons(options.introns)\n+\tco.getAntisenseOnly(options.antisense)\n+\tco.getColinearOnly(options.colinear)\n+\tco.getInvert(options.exclude)\n+\tco.setMaxDistance(options.distance)\n+\tco.setMinOverlap(options.minOverlap)\n+\tco.setPcOverlap(options.pcOverlap)\n+\tco.setIncludedOnly(options.included)\n+\tco.setIncludingOnly(options.including)\n+\tco.includeNotOverlapping(options.notOverlapping)\n+\tco.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/CompareOverlappingSmallQuery.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CompareOverlappingSmallQuery.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,261 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2011\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.TranscriptWriter import TranscriptWriter\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.Mapping import Mapping\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+\n+MINBIN = 3\n+MAXBIN = 7\n+REFERENCE = 0\n+QUERY = 1\n+\n+def getBin(start, end):\n+\tfor i in range(MINBIN, MAXBIN + 1):\n+\t\tbinLevel = 10 ** i\n+\t\tif int(start / binLevel) == int(end / binLevel):\n+\t\t\treturn int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))\n+\treturn int((MAXBIN + 1) * 10 ** (MAXBIN + 1))\n+\n+def getOverlappingBins(start, end):\n+\tarray\t= []\n+\tbigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))\n+\tfor i in range(MINBIN, MAXBIN + 1):\n+\t\tbinLevel = 10 ** i\n+\t\tarray.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))\n+\tarray.append((bigBin, bigBin))\n+\treturn array\n+\n+\n+class CompareOverlappingSmallQuery(object):\n+\n+\tdef __init__(self, verbosity):\n+\t\tself.verbosity = verbosity\n+\t\tself.tableNames = {}\n+\t\tself.nbQueries = 0\n+\t\tself.nbRefs\t = 0\n+\t\tself.nbWritten = 0\n+\t\tself.nbOverlaps = 0\n+\t\tself.distance = None\n+\t\tself.invert = False\n+\t\tself.antisense = False\n+\t\tself.collinear = False\n+\t\tself.pcOverlapQuery = 
False\n+\t\tself.pcOverlapRef = False\n+\t\tself.minOverlap = False\n+\t\tself.included = False\n+\t\tself.including = False\n+\t\tself.bins\t = {}\n+\t\tself.overlaps = {}\n+\t\tself.notOverlapping = False\n+\n+\tdef setReferenceFile(self, fileName, format):\n+\t\tchooser = ParserChooser(self.verbosity)\n+\t\tchooser.findFormat(format)\n+\t\tself.refParser = chooser.getParser(fileName)\n+\n+\tdef setQueryFile(self, fileName, format):\n+\t\tchooser = ParserChooser(self.verbosity)\n+\t\tchooser.findFormat(format)\n+\t\tself.queryParser = chooser.getParser(fileName)\n+\n+\tdef setOutputFile(self, fileName):\n+\t\tself.writer = TranscriptWriter(fileName, "gff3", self.verbosity)\n+\n+\tdef setDistance(self, distance):\n+\t\tself.distance = distance\n+\n+\tdef setInvert(self, boolean):\n+\t\tself.invert = boolean\n+\n+\tdef setCollinear(self, boolean):\n+\t\tself.collinear = boolean\n+\n+\tdef setAntisense(self, boolean):\n+\t\tself.antisense = boolean\n+\n+\tdef setMinPercentOverlap(sel'..b'\tprint "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)\n+\n+\tdef run(self):\n+\t\tself.loadQuery()\n+\t\tself.compare()\n+\t\tself.printResults()\n+\t\tself.displayResults()\n+\n+if __name__ == "__main__":\n+\t\n+\tdescription = "Compare Overlapping Small Query v1.0.1: Provide the queries that overlap with a reference, when the query is small. 
[Category: Data Comparison]"\n+\n+\tparser = OptionParser(description = description)\n+\tparser.add_option("-i", "--input1",\t dest="inputFileName1", action="store",\t\t\t type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")\n+\tparser.add_option("-f", "--format1", dest="format1",\t\t action="store",\t\t\t type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-j", "--input2",\t dest="inputFileName2", action="store",\t\t\t type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")\n+\tparser.add_option("-g", "--format2", dest="format2",\t\t action="store",\t\t\t type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-o", "--output",\t dest="outputFileName", action="store",\t\t\t type="string", help="output file [format: output file in GFF3 format]")\n+\tparser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False,\t\t\t\t help="also output not overlapping data [format: bool] [default: false]")\n+\tparser.add_option("-d", "--distance",\t\tdest="distance",\t action="store",\t default=0,\t type="int",\t help="accept some distance between query and reference [format: int]")\n+\tparser.add_option("-c", "--collinear",\t\tdest="collinear",\t action="store_true", default=False,\t\t\t \t help="provide collinear features [format: bool] [default: false]")\n+\tparser.add_option("-a", "--antisense",\t\tdest="antisense",\t action="store_true", default=False,\t\t\t \t help="provide antisense features [format: bool] [default: false]")\n+\tparser.add_option("-m", "--minOverlap",\t dest="minOverlap", action="store", default=False, type="int",\t help="min. #nt overlap [format: bool] [default: false]")\n+\tparser.add_option("-p", "--pcOverlapQuery",\tdest="pcOverlapQuery", action="store", default=False, type="int",\t help="min. 
% overlap of the query [format: bool] [default: false]")\n+\tparser.add_option("-P", "--pcOverlapRef",\tdest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]")\n+\tparser.add_option("-k", "--included",\t\tdest="included",\t action="store_true", default=False,\t\t\t \t help="provide query elements which are nested in reference elements [format: bool] [default: false]")\n+\tparser.add_option("-K", "--including",\t\tdest="including",\t action="store_true", default=False,\t\t\t \t help="provide query elements in which reference elements are nested [format: bool] [default: false]")\n+\tparser.add_option("-x", "--exclude",\t\tdest="exclude",\t\t action="store_true", default=False,\t\t\t \t help="invert the match [format: bool] [default: false]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity",\t action="store", default=1, type="int",\t help="trace level [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\tcosq = CompareOverlappingSmallQuery(options.verbosity)\n+\tcosq.setQueryFile(options.inputFileName1, options.format1)\n+\tcosq.setReferenceFile(options.inputFileName2, options.format2)\n+\tcosq.setOutputFile(options.outputFileName)\n+\tcosq.includeNotOverlapping(options.notOverlapping)\n+\tcosq.setDistance(options.distance)\n+\tcosq.setCollinear(options.collinear)\n+\tcosq.setAntisense(options.antisense)\n+\tcosq.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef)\n+\tcosq.setMinOverlap(options.minOverlap)\n+\tcosq.setInclude(options.included, options.including)\n+\tcosq.setInvert(options.exclude)\n+\tcosq.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/CompareOverlappingSmallRef.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CompareOverlappingSmallRef.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,250 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2011\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.TranscriptWriter import TranscriptWriter\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.Mapping import Mapping\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+\n+MINBIN = 3\n+MAXBIN = 7\n+REFERENCE = 0\n+QUERY = 1\n+\n+def getBin(start, end):\n+\tfor i in range(MINBIN, MAXBIN + 1):\n+\t\tbinLevel = 10 ** i\n+\t\tif int(start / binLevel) == int(end / binLevel):\n+\t\t\treturn int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))\n+\treturn int((MAXBIN + 1) * 10 ** (MAXBIN + 1))\n+\n+def getOverlappingBins(start, end):\n+\tarray\t= []\n+\tbigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))\n+\tfor i in range(MINBIN, MAXBIN + 1):\n+\t\tbinLevel = 10 ** i\n+\t\tarray.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))\n+\tarray.append((bigBin, bigBin))\n+\treturn array\n+\n+\n+class CompareOverlappingSmallRef(object):\n+\n+\tdef __init__(self, verbosity):\n+\t\tself.verbosity = verbosity\n+\t\tself.tableNames = {}\n+\t\tself.nbQueries = 0\n+\t\tself.nbRefs\t = 0\n+\t\tself.nbWritten = 0\n+\t\tself.nbOverlaps = 0\n+\t\tself.invert = False\n+\t\tself.antisense = False\n+\t\tself.collinear = False\n+\t\tself.distance = None\n+\t\tself.minOverlap = 
False\n+\t\tself.pcOverlapQuery = False\n+\t\tself.pcOverlapRef = False\n+\t\tself.included = False\n+\t\tself.including = False\n+\t\tself.bins\t = {}\n+\t\tself.notOverlapping = False\n+\n+\tdef setReferenceFile(self, fileName, format):\n+\t\tchooser = ParserChooser(self.verbosity)\n+\t\tchooser.findFormat(format)\n+\t\tself.refParser = chooser.getParser(fileName)\n+\n+\tdef setQueryFile(self, fileName, format):\n+\t\tchooser = ParserChooser(self.verbosity)\n+\t\tchooser.findFormat(format)\n+\t\tself.queryParser = chooser.getParser(fileName)\n+\n+\tdef setOutputFile(self, fileName):\n+\t\tself.writer = TranscriptWriter(fileName, "gff3", self.verbosity)\n+\n+\tdef setDistance(self, distance):\n+\t\tself.distance = distance\n+\n+\tdef setCollinear(self, boolean):\n+\t\tself.collinear = boolean\n+\n+\tdef setAntisense(self, boolean):\n+\t\tself.antisense = boolean\n+\n+\tdef setInvert(self, boolean):\n+\t\tself.invert = boolean\n+\n+\tdef setMinPercentOverlap(self, pcOverlapQuery, pcOverlapRe'..b'\t\t\tprint "# refs: %d" % (self.nbRefs)\n+\t\t\tprint "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)\n+\n+\tdef run(self):\n+\t\tself.loadRef()\n+\t\tself.compare()\n+\t\tself.displayResults()\n+\n+if __name__ == "__main__":\n+\t\n+\tdescription = "Compare Overlapping Small Reference v1.0.1: Provide the queries that overlap with a reference, when the reference is small. 
[Category: Data Comparison]"\n+\n+\tparser = OptionParser(description = description)\n+\tparser.add_option("-i", "--input1",\t dest="inputFileName1", action="store",\t\t\t type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")\n+\tparser.add_option("-f", "--format1", dest="format1",\t\t action="store",\t\t\t type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-j", "--input2",\t dest="inputFileName2", action="store",\t\t\t type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")\n+\tparser.add_option("-g", "--format2", dest="format2",\t\t action="store",\t\t\t type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-o", "--output",\t dest="outputFileName", action="store",\t\t\t type="string", help="output file [format: output file in GFF3 format]")\n+\tparser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False,\t\t\t\t help="also output not overlapping data [format: bool] [default: false]")\n+\tparser.add_option("-d", "--distance",\t\tdest="distance",\t action="store",\t default=0,\t type="int",\t help="accept some distance between query and reference [format: int]")\n+\tparser.add_option("-c", "--collinear",\t\tdest="collinear",\t action="store_true", default=False,\t\t\t \t help="provide collinear features [format: bool] [default: false]")\n+\tparser.add_option("-a", "--antisense",\t\tdest="antisense",\t action="store_true", default=False,\t\t\t \t help="provide antisense features [format: bool] [default: false]")\n+\tparser.add_option("-m", "--minOverlap",\t dest="minOverlap", action="store", default=False, type="int",\t help="min. #nt overlap [format: bool] [default: false]")\n+\tparser.add_option("-p", "--pcOverlapQuery",\tdest="pcOverlapQuery", action="store", default=False, type="int",\t help="min. 
% overlap of the query [format: bool] [default: false]")\n+\tparser.add_option("-P", "--pcOverlapRef",\tdest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]")\n+\tparser.add_option("-k", "--included",\t\tdest="included",\t action="store_true", default=False,\t\t\t \t help="provide query elements which are nested in reference elements [format: bool] [default: false]")\n+\tparser.add_option("-K", "--including",\t\tdest="including",\t action="store_true", default=False,\t\t\t \t help="provide query elements in which reference elements are nested [format: bool] [default: false]")\n+\tparser.add_option("-x", "--exclude",\t\tdest="exclude",\t\t action="store_true", default=False,\t\t\t \t help="invert the match [format: bool] [default: false]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity",\t action="store", default=1, type="int",\t help="trace level [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\tcosr = CompareOverlappingSmallRef(options.verbosity)\n+\tcosr.setQueryFile(options.inputFileName1, options.format1)\n+\tcosr.setReferenceFile(options.inputFileName2, options.format2)\n+\tcosr.setOutputFile(options.outputFileName)\n+\tcosr.includeNotOverlapping(options.notOverlapping)\n+\tcosr.setDistance(options.distance)\n+\tcosr.setAntisense(options.antisense)\n+\tcosr.setInclude(options.included, options.including)\n+\tcosr.setInvert(options.exclude)\n+\tcosr.setMinOverlap(options.minOverlap)\n+\tcosr.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef)\n+\tcosr.run()\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ComputeCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ComputeCoverage.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,142 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os, random
from optparse import OptionParser, OptionGroup
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress
from commons.core.writer.Gff3Writer import Gff3Writer


class CoverageComputer(object):
    """Compute how much of a query set is covered by a reference set.

    Every position spanned by a reference exon is marked as covered.  The
    query is then scanned to (a) accumulate a global nucleotide / covered
    nucleotide count and (b) optionally write each query transcript to a
    GFF3 file with a "coverage" tag holding its covered percentage.
    """

    def __init__(self, verbosity = 0):
        """Create a computer; readers and the writer are set by the setters below."""
        self.verbosity       = verbosity
        self.queryReader     = None
        self.referenceReader = None
        self.outputWriter    = None
        self.introns         = False
        self.nbNucleotides   = 0
        self.nbCovered       = 0
        # chromosome -> {position: 1} for every position covered by the reference;
        # initialised here so that the attribute always exists.
        self.coveredRegions  = {}

    def setInputQueryFile(self, fileName, format):
        """Open the query file *fileName*, given in transcript format *format*."""
        self.queryReader = TranscriptContainer(fileName, format, self.verbosity-1)

    def setInputReferenceFile(self, fileName, format):
        """Open the reference file *fileName*, given in transcript format *format*."""
        self.referenceReader = TranscriptContainer(fileName, format, self.verbosity-1)

    def includeIntrons(self, boolean):
        """If *boolean* is true, removeExons() is called on each transcript before scanning it."""
        self.introns = boolean

    def setOutputFileName(self, fileName, title="S-MART", feature="transcript", featurePart="exon"):
        """Create the GFF3 writer for *fileName*.

        The output is optional (the -o option defaults to None and run() skips
        the writing step when no writer is set), so a None file name simply
        leaves the writer unset instead of building a writer on None.
        """
        if fileName is None:
            return
        self.outputWriter = Gff3Writer(fileName, self.verbosity-1)
        self.outputWriter.setTitle(title)
        self.outputWriter.setFeature(feature)
        self.outputWriter.setFeaturePart(featurePart)

    def _getCoveredPositions(self, transcript):
        """Return the covered-position map of the transcript's chromosome ({} if the reference has none)."""
        return self.coveredRegions.get(transcript.getChromosome(), {})

    def readReference(self):
        """Mark every position spanned by a reference exon as covered."""
        self.coveredRegions = {}
        progress = Progress(self.referenceReader.getNbTranscripts(), "Reading reference file", self.verbosity-1)
        for transcript in self.referenceReader.getIterator():
            covered = self.coveredRegions.setdefault(transcript.getChromosome(), {})
            if self.introns:
                # presumably collapses the transcript into one interval so that
                # introns are scanned too -- TODO confirm against Transcript.removeExons
                transcript.removeExons()
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    covered[position] = 1
            progress.inc()
        progress.done()

    def readQuery(self):
        """Accumulate the global query size and the number of covered query positions."""
        progress = Progress(self.queryReader.getNbTranscripts(), "Reading query file", self.verbosity-1)
        for transcript in self.queryReader.getIterator():
            progress.inc()
            # NOTE(review): query transcripts on chromosomes absent from the
            # reference are not counted in nbNucleotides (historical behaviour,
            # kept as is) -- confirm this is the intended statistic.
            if transcript.getChromosome() not in self.coveredRegions:
                continue
            covered = self._getCoveredPositions(transcript)
            if self.introns:
                transcript.removeExons()
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    self.nbNucleotides += 1
                    self.nbCovered     += covered.get(position, 0)
        progress.done()

    def write(self):
        """Write every query transcript with a "coverage" tag (percentage of covered positions)."""
        progress = Progress(self.queryReader.getNbTranscripts(), "Writing output file", self.verbosity-1)
        for transcript in self.queryReader.getIterator():
            # A query chromosome missing from the reference means 0% coverage;
            # indexing coveredRegions directly raised KeyError here.
            covered = self._getCoveredPositions(transcript)
            if self.introns:
                transcript.removeExons()
            size     = transcript.getSize()
            coverage = 0
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    coverage += covered.get(position, 0)
            transcript.setTagValue("coverage", 0 if size == 0 else float(coverage) / size * 100)
            self.outputWriter.addTranscript(transcript)
            progress.inc()
        progress.done()

    def sumUp(self):
        """Print the global coverage statistics on standard output."""
        percentage = 0 if self.nbNucleotides == 0 else float(self.nbCovered) / self.nbNucleotides * 100
        # Parenthesised single expression: same output with the Python 2 print
        # statement, and also valid with the Python 3 print function.
        print("%d nucleotides in query, %d (%.f%%) covered" % (self.nbNucleotides, self.nbCovered, percentage))

    def run(self):
        """Read the reference, scan the query, write the output if requested, print the summary."""
        self.readReference()
        self.readQuery()
        if self.outputWriter is not None:
            self.write()
        self.sumUp()


if __name__ == "__main__":

    # parse command line
    description = "Compute Coverage v1.0.1: Compute the coverage of a set with respect to another set. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1",    dest="inputFileName1", action="store",                     type="string", help="input query file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1",   dest="format1",        action="store",                     type="string", help="format of the first file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2",    dest="inputFileName2", action="store",                     type="string", help="input reference file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2",   dest="format2",        action="store",                     type="string", help="format of the second file [compulsory] [format: transcript file format]")
    parser.add_option("-t", "--introns",   dest="introns",        action="store_true", default=False,                help="also include introns [format: boolean] [default: false]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",      default=None,  type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [default: 1] [format: int]")
    (options, args) = parser.parse_args()

    # Fail early, with a usage message, when a compulsory option is missing.
    for flag, value in (("-i", options.inputFileName1), ("-f", options.format1), ("-j", options.inputFileName2), ("-g", options.format2)):
        if value is None:
            parser.error("option %s is compulsory" % flag)

    computer = CoverageComputer(options.verbosity)
    computer.setInputQueryFile(options.inputFileName1, options.format1)
    computer.setInputReferenceFile(options.inputFileName2, options.format2)
    computer.includeIntrons(options.introns)
    computer.setOutputFileName(options.outputFileName)
    computer.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/CountReadGCPercent.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CountReadGCPercent.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#!/usr/bin/env python

from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress
from commons.core.utils.RepetOptionParser import RepetOptionParser
from Gnome_tools.CountGCPercentBySlidingWindow import CountGCPercentBySlidingWindow


class CountReadGCPercent(object):
    """Tag every element of a GFF3 file with the GC and N percentages of its sequence.

    The sequences are fetched from a FASTA file; the tagged elements are
    written to a GFF3 output file.
    """

    def __init__(self, verbosity = 0):
        """Create the counter.

        verbosity -- trace level handed to the readers, writer and progress
                     bars (defaults to 0, the former hard-coded value).
        """
        self.referenceReader = None
        self.gffReader       = None
        self.outputWriter    = None
        self.verbose         = verbosity

    def setInputReferenceFile(self, fileName):
        """Remember the FASTA file name; it is only parsed in write()."""
        self.referenceReader = fileName

    def setInputGffFile(self, fileName):
        """Open the GFF3 annotation file *fileName*."""
        self.gffReader = TranscriptContainer(fileName, 'gff3', self.verbose)

    def setOutputFileName(self, fileName):
        """Create the GFF3 writer for *fileName*."""
        self.outputWriter = Gff3Writer(fileName, self.verbose)

    def readGffAnnotation(self):
        """Fill self.coveredRegions (chromosome -> {position: 1}) from the annotation exons.

        NOTE(review): nothing in this file reads coveredRegions afterwards;
        the step is kept in case external code relies on the attribute.
        """
        self.coveredRegions = {}
        progress = Progress(self.gffReader.getNbTranscripts(), "Reading gff3 annotation file", self.verbose)
        for transcript in self.gffReader.getIterator():
            covered = self.coveredRegions.setdefault(transcript.getChromosome(), {})
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    covered[position] = 1
            progress.inc()
        progress.done()

    def write(self):
        """Compute the GC / N percentages of each element and write it with its new tags."""
        iParser = FastaParser(self.referenceReader)
        iParser.setTags()
        # Look the chromosome up directly instead of scanning every sequence
        # name of the FASTA file for every exon.
        sequenceNames     = set(iParser.getTags().keys())
        iGetGCPercentBySW = CountGCPercentBySlidingWindow()
        progress = Progress(self.gffReader.getNbTranscripts(), "Writing output file", self.verbose)
        for transcript in self.gffReader.getIterator():
            chromosome = transcript.getChromosome()
            # Elements whose chromosome is not in the FASTA file keep 0 / 0.
            GCpercent  = 0
            nPercent   = 0
            if chromosome in sequenceNames:
                # NOTE(review): with several exons only the values of the last
                # exon are kept (unchanged behaviour) -- confirm inputs are
                # single-exon reads.
                for exon in transcript.getExons():
                    subSequence = iParser.getSubSequence(chromosome, exon.getStart(), exon.getEnd(), 1)
                    GCpercent, nPercent = iGetGCPercentBySW.getGCPercentAccordingToNAndNPercent(subSequence)
                    # Parenthesised single expression: identical output with the
                    # Python 2 print statement, also valid in Python 3.
                    print("GCpercent = %f, nPercent = %f" % (GCpercent, nPercent))
            transcript.setTagValue("GCpercent", GCpercent)
            transcript.setTagValue("NPercent", nPercent)
            self.outputWriter.addTranscript(transcript)
            progress.inc()
        progress.done()

    def run(self):
        """Read the annotation, then write the tagged output if a writer was set."""
        self.readGffAnnotation()
        if self.outputWriter is not None:
            self.write()


if __name__ == "__main__":
    description = "Count GC percent for each read against a genome."
    usage = "CountReadGCPercent.py -i <fasta file> -j <gff3 file> -o <output gff3 file> [-v <verbose>] [-h]"
    examples = "\nExample: \n"
    examples += "\t$ python CountReadGCPercent.py -i file.fasta -j annotation.gff -o output.gff3"
    examples += "\n\n"
    parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
    parser.add_option( '-i', '--inputGenome',     dest='fastaFile',  help='fasta file [compulsory]',       default= None )
    parser.add_option( '-j', '--inputAnnotation', dest='gffFile',    help='gff3 file [compulsory]',        default= None)
    parser.add_option( '-o', '--output',          dest='outputFile', help='output gff3 file [compulsory]', default= None )
    parser.add_option( '-v', '--verbose',         dest='verbose',    help='verbosity level (default=0/1)',type="int", default= 0 )
    (options, args) = parser.parse_args()

    # Stop with a readable message when a compulsory option is missing.
    if options.fastaFile is None or options.gffFile is None or options.outputFile is None:
        raise SystemExit("CountReadGCPercent.py: options -i, -j and -o are compulsory\nusage: %s" % usage)

    # The -v value was parsed but never used; hand it to the counter.
    readGCPercent = CountReadGCPercent(options.verbose)
    readGCPercent.setInputReferenceFile(options.fastaFile)
    readGCPercent.setInputGffFile(options.gffFile)
    readGCPercent.setOutputFileName(options.outputFile)
    readGCPercent.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/FindOverlapsOptim.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/FindOverlapsOptim.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,343 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2012\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+\n+import os, struct, time, shutil\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.ncList.NCList import NCList\n+from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList\n+from SMART.Java.Python.ncList.NCListParser import NCListParser\n+from SMART.Java.Python.ncList.NCListCursor import NCListCursor\n+from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle\n+from SMART.Java.Python.ncList.NCListHandler import NCListHandler\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+try:\n+ import cPickle as pickle\n+except:\n+ import pickle\n+\n+REFERENCE = 0\n+QUERY = 1\n+TYPES = (REFERENCE, QUERY)\n+TYPETOSTRING = {0: "reference", 1: "query"}\n+\n+class FindOverlapsOptim(object):\n+\t\n+\tdef __init__(self, verbosity = 1):\n+\t\tself._parsers\t\t\t\t = {}\n+\t\tself._sortedFileNames\t\t = {}\n+\t\tself._outputFileName\t\t = "outputOverlaps.gff3"\n+\t\tself._iWriter\t\t\t\t = None\n+\t\tself._inputFileNames\t\t = {REFERENCE: None, QUERY: None}\n+\t\tself._convertedFileNames = {REFERENCE: False, QUERY: False}\n+\t\tself._inputFileFormats\t\t = {REFERENCE: None, QUERY: None}\n+\t\tself._converted\t\t\t = {REFERENCE: False, QUERY: False}\n+\t\tself._ncListHandlers = 
{REFERENCE: None, QUERY: None}\n+\t\tself._splittedFileNames\t = {REFERENCE: {},\tQUERY: {}}\n+\t\tself._nbOverlappingQueries\t = 0\n+\t\tself._nbOverlaps\t\t\t = 0\n+\t\tself._nbLines\t\t\t\t = {REFERENCE: 0, QUERY: 0}\n+\t\tself._sorted = False\n+\t\tself._index = False\n+\t\tself._verbosity\t\t\t = verbosity\n+\t\tself._ncLists\t\t\t\t = {}\n+\t\tself._cursors\t\t\t\t = {}\n+\t\tself._nbElementsPerChromosome = {}\n+\t\tself._tmpDirectories\t\t = {REFERENCE: False, QUERY: False}\n+\t\t\n+\tdef close(self):\n+\t\tself._iWriter.close()\n+\t\tfor fileName in (self._sortedFileNames.values()):\n+\t\t\tif os.path.exists(fileName):\n+\t\t\t\tos.remove(fileName)\n+\t\tfor fileName in self._convertedFileNames.values():\n+\t\t\tif fileName:\n+\t\t\t\tos.remove(fileName)\n+\t\t\n+\tdef setRefFileName(self, fileName, format):\n+\t\tself.setFileName(fileName, format, REFERENCE)\n+\t\t\n+\tdef setQueryFileName(self, fileName, format):\n+\t\tself.setFileName(fileName, format, QUERY)\n+\n+\tdef se'..b'def isOverlapping(self, queryTranscript, refTranscript):\n+\t\tif (queryTranscript.getStart() <= refTranscript.getEnd() and queryTranscript.getEnd() >= refTranscript.getStart()):\n+\t\t\treturn 0 \n+\t\tif queryTranscript.getEnd() < refTranscript.getStart():\n+\t\t\treturn 1\n+\t\treturn -1\n+\n+\tdef checkIndex(self, transcript, cursor):\n+\t\tif not self._index:\n+\t\t\treturn None\n+\t\tchromosome = transcript.getChromosome()\n+\t\tnextLIndex = self._indices[REFERENCE][chromosome].getIndex(transcript)\n+\t\tif nextLIndex == None:\n+\t\t\treturn None\n+\t\tncList\t\t = self._ncLists[REFERENCE][chromosome]\n+\t\tnextGffAddress = ncList.getRefGffAddr(nextLIndex)\n+\t\tthisGffAddress = cursor.getGffAddress()\n+\t\tif nextGffAddress > thisGffAddress:\n+\t\t\treturn nextLIndex\n+\t\treturn None\n+\t\t\n+\tdef _writeIntervalInNewGFF3(self, transcript, names):\n+\t\tnbOverlaps = 0\n+\t\tfor cpt in names.values():\n+\t\t\tnbOverlaps += cpt\n+\t\tif not 
names:\n+\t\t\treturn\n+\t\ttranscript.setTagValue("overlapsWith", "--".join(sorted(names.keys())))\n+\t\ttranscript.setTagValue("nbOverlaps", nbOverlaps)\n+\t\tself._iWriter.addTranscript(transcript)\n+\t\tself._iWriter.write()\n+\t\tself._nbOverlappingQueries += 1\n+\t\tself._nbOverlaps\t\t += nbOverlaps\n+\t\t\n+\tdef _extractID(self, transcript):\n+\t\tnbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1\n+\t\tid\t\t = transcript.getTagValue("ID")\t\t\t\t if "ID"\t\t in transcript.getTagNames() else transcript.getUniqueName()\n+\t\treturn {id: nbElements}\n+\t\t\n+\tdef run(self):\n+\t\tself.createNCLists()\n+\t\tself.compare()\n+\t\tself.close()\n+\t\tif self._verbosity > 0:\n+\t\t\tprint "# queries: %d" % (self._nbLines[QUERY])\n+\t\t\tprint "# refs: %d" % (self._nbLines[REFERENCE])\n+\t\t\tprint "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)\n+\t\t\tprint "time: %.2gs" % (self._timeSpent)\n+\n+\n+if __name__ == "__main__":\n+\tdescription = "Find Overlaps Optim v1.0.0: Finds overlaps with several query intervals. 
[Category: Data Comparison]"\n+\n+\tparser = OptionParser(description = description)\n+\tparser.add_option("-i", "--query",\t dest="inputQueryFileName", action="store",\t\t\t type="string", help="query input file [compulsory] [format: file in transcript or other format given by -f]")\n+\tparser.add_option("-f", "--queryFormat", dest="queryFormat",\t\taction="store",\t\t\t type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")\n+\tparser.add_option("-j", "--ref",\t\t dest="inputRefFileName", action="store",\t\t\t type="string", help="reference input file [compulsory] [format: file in transcript or other format given by -g]")\n+\tparser.add_option("-g", "--refFormat", dest="refFormat",\t\t action="store",\t\t\t type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")\n+\tparser.add_option("-o", "--output",\t dest="outputFileName",\t action="store",\t\t\t type="string", help="output file [compulsory] [format: output file in GFF3 format]")\n+\tparser.add_option("-d", "--index",\t dest="index",\t action="store_true", default=False,\t help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")\n+\tparser.add_option("-s", "--sorted",\t dest="sorted",\t action="store_true", default=False,\t help="input files are already sorted [format: boolean] [default: False]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity",\t\t action="store", default=1, type="int",\t help="Trace level [format: int] [default: 1]")\n+\t(options, args) = parser.parse_args()\n+\t\n+\tiFOO = FindOverlapsOptim(options.verbosity)\n+\tiFOO.setRefFileName(options.inputRefFileName, options.refFormat)\n+\tiFOO.setQueryFileName(options.inputQueryFileName, options.queryFormat)\n+\tiFOO.setOutputFileName(options.outputFileName)\n+\tiFOO.setIndex(options.index)\n+\tiFOO.setSorted(options.sorted)\n+\tiFOO.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/GetDifferentialExpression.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/GetDifferentialExpression.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,441 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""Get the differential expression between 2 conditions (2 files), on regions defined by a third file"""\n+\n+import os, re\n+from optparse import OptionParser\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from SMART.Java.Python.misc import Utils\n+from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection\n+from SMART.Java.Python.structure.Transcript import Transcript\n+\n+class GetDifferentialExpression(object):\n+ \n+ def __init__(self, verbosity = 1):\n+ self.verbosity = verbosity\n+ self.mySqlConnection = MySqlConnection(verbosity)\n+ self.inputs = (0, 1)\n+ self.transcriptContainers = [None, None]\n+ self.transcriptContainerRef = None\n+ self.outputFileName = None\n+ self.writer = None\n+ self.tables = [None, None]\n+ self.nbElements = [0, 0]\n+\n+ self.regionsToValues = {}\n+ self.regionsToNames = {}\n+ self.valuesToPvalues = {}\n+\n+ self.oriented = True\n+ self.simpleNormalization = False\n+ self.simpleNormalizationParameters = None\n+ self.adjustedNormalization = False\n+ self.fixedSizeFactor = None\n+ self.normalizationSize = None\n+ self.normalizationFactors = [1, 1]\n+ self.fdr = None \n+ self.fdrPvalue = None \n+\n+ self.plot = False\n+ self.plotter = None\n+ self.plotterName = None\n+ self.points = {}\n+\n+\n+ def setInputFile(self, i, fileName, fileFormat):\n+ self.transcriptContainers[i] = 
TranscriptContainer(fileName, fileFormat, self.verbosity)\n+ self.transcriptContainers[i].mySqlConnection = self.mySqlConnection\n+\n+\n+ def setReferenceFile(self, fileName, fileFormat):\n+ self.transcriptContainerRef = TranscriptContainer(fileName, fileFormat, self.verbosity)\n+ self.transcriptContainerRef.mySqlConnection = self.mySqlConnection\n+\n+\n+ def setOutputFile(self, fileName):\n+ self.outputFileName = fileName\n+ self.writer = Gff3Writer(fileName, self.verbosity)\n+\n+ \n+ def setOriented(self'..b' file in transcript format given by -f]")\n+ parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")\n+ parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")\n+ parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")\n+ parser.add_option("-k", "--reference", dest="referenceFileName", action="store", type="string", help="reference file [compulsory] [format: file in transcript format given by -l]")\n+ parser.add_option("-l", "--referenceFormat", dest="referenceFormat", action="store", type="string", help="format of reference file [compulsory] [format: transcript file format]")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in gff3 format]")\n+ parser.add_option("-n", "--notOriented", dest="notOriented", action="store_true", default=False, help="if the reads are not oriented [default: False] [format: bool]")\n+ parser.add_option("-s", "--simple", dest="simple", action="store_true", default=False, help="normalize using the number of reads in each condition [format: bool]")\n+ parser.add_option("-S", "--simpleParameters", dest="simpleParameters", action="store", 
default=None, type="string", help="provide the number of reads [format: bool]")\n+ parser.add_option("-a", "--adjusted", dest="adjusted", action="store_true", default=False, help="normalize using the number of reads of \'mean\' regions [format: bool]")\n+ parser.add_option("-x", "--fixedSizeFactor", dest="fixedSizeFactor", action="store", default=None, type="int", help="give the magnification factor for the normalization using fixed size sliding windows in reference regions (leave empty for no such normalization) [format: int]")\n+ parser.add_option("-d", "--fdr", dest="fdr", action="store", default=None, type="float", help="use FDR [format: float]")\n+ parser.add_option("-p", "--plot", dest="plotName", action="store", default=None, type="string", help="plot cloud plot [format: output file in PNG format]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")\n+ (options, args) = parser.parse_args()\n+\n+\n+ \n+ differentialExpression = GetDifferentialExpression(options.verbosity)\n+ differentialExpression.setInputFile(0, options.inputFileName1, options.format1)\n+ differentialExpression.setInputFile(1, options.inputFileName2, options.format2)\n+ differentialExpression.setReferenceFile(options.referenceFileName, options.referenceFormat)\n+ differentialExpression.setOutputFile(options.outputFileName)\n+ if options.plotName != None :\n+ differentialExpression.setPlotterName(options.plotName)\n+ differentialExpression.setPlotter()\n+ differentialExpression.setOriented(not options.notOriented)\n+ differentialExpression.setSimpleNormalization(options.simple)\n+ differentialExpression.setSimpleNormalizationParameters(options.simpleParameters)\n+ differentialExpression.setAdjustedNormalization(options.adjusted)\n+ differentialExpression.setFixedSizeNormalization(options.fixedSizeFactor)\n+ differentialExpression.setFdr(options.fdr)\n+ differentialExpression.getDifferentialExpression()\n+ 
differentialExpression.mySqlConnection.deleteDatabase()\n+ \n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/GetDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/GetDistribution.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,362 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2012\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import os\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.parsing.FastaParser import FastaParser\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from SMART.Java.Python.misc.MultipleRPlotter import MultipleRPlotter\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+from SMART.Java.Python.misc.Progress import Progress\n+\n+TWOSTRANDS = {True: [1, -1], False: [0]}\n+STRANDTOSTR = {1: "(+)", -1: "(-)", 0: ""}\n+\n+class GetDistribution(object):\n+\n+\tdef __init__(self, verbosity):\n+\t\tself.verbosity = verbosity\n+\t\tself.sizes = None\n+\t\tself.twoStrands = False\n+\t\tself.start = 1\n+\t\tself.names = ["nbElements"]\n+\t\tself.average = False\n+\t\tself.nbValues = {}\n+\t\tself.height = 300\n+\t\tself.width = 600\n+\t\tself.colors = None\n+\t\tself.gffFileName = None\n+\t\tself.csvFileName = None\n+\t\tself.yMin = None\n+\t\tself.yMax = None\n+\t\tself.chromosome = None\n+\t\tself.merge = False\n+\t\tself.nbTranscripts = None\n+\n+\tdef setInputFile(self, fileName, format):\n+\t\tchooser = ParserChooser(self.verbosity)\n+\t\tchooser.findFormat(format)\n+\t\tself.parser = chooser.getParser(fileName)\n+\n+\tdef setReferenceFile(self, fileName):\n+\t\tif fileName == None:\n+\t\t\treturn\n+\t\tfastaParser = FastaParser(fileName, self.verbosity)\n+\t\tself.chromosomes = fastaParser.getRegions()\n+\t\tself.sizes = 
dict([region, fastaParser.getSizeOfRegion(region)] for region in self.chromosomes)\n+\t\tself.maxSize = max(self.sizes.values())\n+\n+\tdef setRegion(self, chromosome, start, end):\n+\t\tif chromosome == None:\n+\t\t\treturn\n+\t\tself.maxSize = options.end\n+\t\tself.sizes = {chromosome: end}\n+\t\tself.chromosomes = [chromosome]\n+\t\tself.chromosome = chromosome\n+\t\tself.start = start\n+\t\tself.end = end\n+\n+\tdef setOutputFile(self, fileName):\n+\t\tself.outputFileName = fileName\n+\n+\tdef setNbBins(self, nbBins):\n+\t\tself.nbBins = nbBins\n+\n+\tdef set2Strands(self, twoStrands):\n+\t\tself.twoStrands = twoStrands\n+\n+\tdef setNames(self, names):\n+\t\tself.names = names\n+\n+\tdef setAverage(self, average):\n+\t\tself.average = average\n+\n+\tdef setNormalization(self, normalization):\n+\t\tself.normalization = normalization\n+\t\n+\tdef setImageSize(self, height, width):\n+\t\tself.height = height\n+\t\tself.width '..b' action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]")\n+\tparser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]")\n+\tparser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]")\n+\tparser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]")\n+\tparser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]")\n+\tparser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]")\n+\tparser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]")\n+\tparser.add_option("-x", "--csv", dest="csv", 
action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]")\n+\tparser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]")\n+\tparser.add_option("-H", "--height", dest="height", action="store", default=300, type="int", help="height of the graphics [format: int] [default: 300]")\n+\tparser.add_option("-W", "--width", dest="width", action="store", default=600, type="int", help="width of the graphics [format: int] [default: 1000]")\n+\tparser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]")\n+\tparser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]")\n+\tparser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]")\n+\tparser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]")\n+\tparser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\tgt = GetDistribution(options.verbosity)\n+\tgt.setInputFile(options.inputFileName, options.format)\n+\tgt.setOutputFile(options.outputFileName)\n+\tgt.setReferenceFile(options.referenceFileName)\n+\tgt.setNbBins(int(options.nbBins))\n+\tgt.set2Strands(options.bothStrands)\n+\tgt.setRegion(options.chromosome, options.start, 
options.end)\n+\tgt.setNormalization(options.normalize)\n+\tgt.setAverage(options.average)\n+\tgt.setYLimits(options.yMin, options.yMax)\n+\tgt.writeCsv(options.csv)\n+\tgt.writeGff(options.gff)\n+\tgt.setImageSize(options.height, options.width)\n+\tgt.setNames(options.names.split(","))\n+\tgt.setColors(None if options.colors == None else options.colors.split(","))\n+\tgt.setNormalization(options.normalize)\n+\tgt.mergePlots(options.mergePlots)\n+\tgt.run()\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/GetFlanking.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/GetFlanking.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,233 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2011\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.TranscriptWriter import TranscriptWriter\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.misc.Progress import Progress\n+\n+QUERY = 0\n+REFERENCE = 1\n+INPUTS = (QUERY, REFERENCE)\n+STRANDS = (-1, 1)\n+TAG_DISTANCE = "distance_"\n+TAG_SENSE = "_sense"\n+TAG_REGION = "_region"\n+TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"}\n+TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}\n+TAGS_SENSE = {-1: "antisense", 0: "", 1: "collinear"}\n+STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}\n+\n+\n+def getOrderKey(transcript, direction, input):\n+\tif direction == 1:\n+\t\tif input == QUERY:\n+\t\t\treturn (transcript.getEnd(), -transcript.getStart())\n+\t\treturn (transcript.getStart(), -transcript.getEnd())\n+\tif input == QUERY:\n+\t\treturn (-transcript.getStart(), transcript.getEnd())\n+\treturn (-transcript.getEnd(), transcript.getStart())\n+\n+\n+class GetFlanking(object):\n+\n+\tdef __init__(self, verbosity):\n+\t\tself.verbosity = verbosity\n+\t\tself.transcripts = dict([id, {}] for id in INPUTS)\n+\t\tself.directions = []\n+\t\tself.noOverlap = False\n+\t\tself.colinear = False\n+\t\tself.antisense = False\n+\t\tself.distance = None\n+\t\tself.minDistance = None\n+\t\tself.maxDistance = None\n+\t\tself.tagName = "flanking"\n+\n+\tdef setInputFile(self, fileName, format, id):\n+\t\tchooser = 
ParserChooser(self.verbosity)\n+\t\tchooser.findFormat(format)\n+\t\tparser = chooser.getParser(fileName)\n+\t\tfor transcript in parser.getIterator():\n+\t\t\tchromosome = transcript.getChromosome()\n+\t\t\tif chromosome not in self.transcripts[id]:\n+\t\t\t\tself.transcripts[id][chromosome] = []\n+\t\t\tself.transcripts[id][chromosome].append(transcript)\n+\n+\tdef setOutputFile(self, fileName):\n+\t\tself.writer = TranscriptWriter(fileName, "gff3", self.verbosity)\n+\n+\tdef addUpstreamDirection(self, upstream):\n+\t\tif upstream:\n+\t\t\tself.directions.append(-1)\n+\n+\tdef addDownstreamDirection(self, downstream):\n+\t\tif downstream:\n+\t\t\tself.directions.append(1)\n+\n+\tdef setColinear(self, colinear):\n+\t\tself.colinear = colinear\n+\n+\tdef setAntisense(self, antisense):\n+\t\tself.antisense = antisense\n+\n+\tdef setNoOverlap(self, noOverlap):\n+\t\tself.noOverlap = noOverlap\n+\n+\tdef setMinDistance(self, distance):\n+\t\tself.minDistance = distance\n+\n+\tdef setMaxDistance(se'..b'scriptRef: transcriptQuery.getDistance(transcriptRef))[0]\n+\t\t\t\tself.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0))\n+\t\t\tprogress.inc()\n+\t\tprogress.done()\n+\n+\tdef run(self):\n+\t\tfor chromosome in sorted(self.transcripts[QUERY].keys()):\n+\t\t\tself.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome])\n+\t\t\tfor direction in STRANDS:\n+\t\t\t\t#print "comparison", chromosome, direction\n+\t\t\t\tself.getFlanking(chromosome, direction)\n+\t\t\tself.write()\n+\t\tself.writer.close()\n+\n+if __name__ == "__main__":\n+\t\n+\tdescription = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. 
[Category: Data Selection]"\n+\n+\tparser = OptionParser(description = description)\n+\tparser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")\n+\tparser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")\n+\tparser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]")\n+\tparser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]")\n+\tparser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]")\n+\tparser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]")\n+\tparser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")\n+\tparser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]")\n+\tparser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: 
int]")\n+\tparser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")\n+\tparser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\tgf = GetFlanking(options.verbosity)\n+\tgf.setInputFile(options.inputFileName1, options.format1, QUERY)\n+\tgf.setInputFile(options.inputFileName2, options.format2, REFERENCE)\n+\tgf.setOutputFile(options.outputFileName)\n+\tgf.addUpstreamDirection(options.upstream)\n+\tgf.addDownstreamDirection(options.downstream)\n+\tgf.setColinear(options.colinear)\n+\tgf.setAntisense(options.antisense)\n+\tgf.setNoOverlap(options.noOverlap)\n+\tgf.setMinDistance(options.minDistance)\n+\tgf.setMaxDistance(options.maxDistance)\n+\tgf.setNewTagName(options.tagName)\n+\tgf.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/SelectByTag.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/SelectByTag.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,148 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""Select the transcript such that a tag value is not less than a given threshold"""
import os
import sys
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
# Import the class itself (not the module) so that it can be instantiated in setWriter().
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.RPlotter import RPlotter


class SelectByTag(object):
    """Keep the transcripts whose tag either equals a given value or lies between two limits.

    The attributes ``input``, ``format``, ``tag``, ``value``, ``min``, ``max``,
    ``default``, ``output`` and ``mysql`` are set directly by the caller before
    ``run()`` is invoked.
    """

    def __init__(self, verbosity = 1):
        # user-provided settings
        self.input = None
        self.format = None
        self.tag = None
        self.value = None
        self.min = None
        self.max = None
        self.default = None
        self.output = None
        self.mysql = None
        self.verbosity = verbosity

        # internal state, filled by setParser() / setWriter() / readInputFile()
        self.parser = None
        self.writer = None
        self.mysqlWriter = None
        self.nbElements = None
        self.nbWritten = 0


    def setParser(self):
        """Open the input file and record the number of transcripts it contains."""
        self.parser = TranscriptContainer(self.input, self.format, self.verbosity)
        self.nbElements = self.parser.getNbTranscripts()


    def setWriter(self):
        """Create the GFF3 writer and, when requested, the MySQL writer."""
        self.writer = Gff3Writer(self.output, self.verbosity)
        if self.mysql:
            self.mysqlWriter = MySqlTranscriptWriter(self.output, self.verbosity)


    def isAccepted(self, transcript):
        """Tell whether a transcript passes the tag filter.

        When the tag is absent, ``self.default`` is used instead; if no default
        was given an Exception is raised.  When ``self.value`` is set, the
        transcript is accepted if the tag value equals it textually or, when
        ``self.value`` is made of digits only, numerically.  Otherwise the tag
        value is converted to float and compared with ``self.min`` / ``self.max``
        (a limit set to None is not checked).
        """
        value = transcript.getTagValue(self.tag)
        if value is None:
            if self.default is None:
                raise Exception("Error! Transcript %s no tag called '%s'" % (transcript, self.tag))
            value = self.default
        if self.value is not None:
            if self.value == str(value):
                return True
            if not self.value.isdigit():
                return False
            # Coerce the tag value before comparing: a textual tag value such as
            # "5.0" would otherwise never be equal to the float 5.0.
            try:
                return float(value) == float(self.value)
            except (TypeError, ValueError):
                return False
        value = float(value)
        return (self.min is None or self.min <= value) and (self.max is None or self.max >= value)


    def readInputFile(self):
        """Send every accepted transcript to the writer(s) and count them."""
        progress = Progress(self.parser.getNbTranscripts(), "Writing transcripts", self.verbosity)
        for transcript in self.parser.getIterator():
            if self.isAccepted(transcript):
                self.writer.addTranscript(transcript)
                if self.mysql:
                    self.mysqlWriter.addTranscript(transcript)
                self.nbWritten += 1
            progress.inc()
        progress.done()


    def writeFile(self):
        """Flush the writer(s)."""
        self.writer.write()
        if self.mysql:
            self.mysqlWriter.write()


    def run(self):
        """Read, filter and write the transcripts, then print a short summary."""
        self.setParser()
        self.setWriter()
        self.readInputFile()
        self.writeFile()
        if self.verbosity > 0:
            # Single-argument print(...) gives the same output on Python 2 and 3.
            print("%d input" % (self.nbElements))
            if self.nbElements != 0:
                print("%d output (%.2f%%)" % (self.nbWritten, float(self.nbWritten) / self.nbElements * 100))



if __name__ == "__main__":

    # parse command line
    description = "Select by Tag v1.0.2: Keep the genomic coordinates such that a the value of a given tag is between two limits. [Category: Data Selection]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]")
    parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="the tag [compulsory] [format: string]")
    parser.add_option("-a", "--value", dest="value", action="store", default=None, type="string", help="the value to be found [format: string]")
    parser.add_option("-m", "--min", dest="min", action="store", default=None, type="float", help="the minimum threshold [format: float]")
    parser.add_option("-M", "--max", dest="max", action="store", default=None, type="float", help="the maximum threshold [format: float]")
    parser.add_option("-d", "--default", dest="default", action="store", default=None, type="float", help="value if tag is not present [format: float]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-y", "--mysql", dest="mysql", action="store_true", default=False, help="write output into MySQL tables [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    selectByTag = SelectByTag(options.verbosity)
    selectByTag.input = options.inputFileName
    selectByTag.format = options.format
    selectByTag.tag = options.tag
    selectByTag.value = options.value
    selectByTag.min = options.min
    selectByTag.max = options.max
    selectByTag.default = options.default
    selectByTag.output = options.outputFileName
    selectByTag.mysql = options.mysql
    selectByTag.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/WrappGetDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/WrappGetDistribution.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
"""Wrapper which runs getDistribution.py in a scratch directory and packs its outputs in a tar file."""
from optparse import OptionParser
import tarfile
import os
import re
import shutil
import subprocess

SMART_PATH = "%s/SMART" % os.environ["REPET_PATH"]


def toTar(tarFileName, directory):
    """Pack the files of `directory` whose name contains the base name of `tarFileName`.

    The archive is first written to "<tarFileName without extension>.tmp.tar",
    closed, and only then moved to `tarFileName`, so that the final file is
    always a complete archive.  Members are stored under their bare file name.
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarName, "w")
    try:
        for entry in os.listdir(directory):
            path = os.path.join(directory, entry)
            # never try to put the archive inside itself
            if os.path.abspath(path) == os.path.abspath(tmpTarName):
                continue
            # plain substring test: the base name is a file name, not a regular expression
            if fileNameBaseName in entry:
                tfile.add(path, arcname=entry)
    finally:
        tfile.close()
    shutil.move(tmpTarName, tarFileName)


if __name__ == "__main__":

    # parse command line
    description = "Get Distribution v1.0.1: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outTarFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [compulsory] [format: file in FASTA format]")
    parser.add_option("-n", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]")
    parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]")
    parser.add_option("-w", "--raw", dest="raw", action="store_true", default=False, help="plot raw number of occurrences instead of density [format: bool] [default: false]")
    parser.add_option("-x", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]")
    parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]")
    parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]")
    parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]")
    parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]")
    parser.add_option("-g", "--gff", dest="gff", action="store_true", default=False, help="also write GFF3 file [format: bool] [default: false]")
    parser.add_option("-H", "--height", dest="height", action="store", default=None, type="int", help="height of the graphics [format: int] [default: 300]")
    parser.add_option("-W", "--width", dest="width", action="store", default=None, type="int", help="width of the graphics [format: int] [default: 1000]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool]")
    (options, args) = parser.parse_args()

    if options.inputFileName is None or options.format is None or options.outTarFileName is None:
        parser.error("options -i, -f and -o are compulsory")

    # Resolve the user paths before changing directory, otherwise relative
    # paths would be looked up inside the scratch directory.
    absPath = os.getcwd()
    inputFileName = os.path.abspath(options.inputFileName)
    outTarFileName = os.path.abspath(options.outTarFileName)
    referenceFileName = options.referenceFileName
    if referenceFileName is not None:
        referenceFileName = os.path.abspath(referenceFileName)

    print("the current path is : %s" % absPath)
    directory = "/tmp/wrappGetDistribution"
    print("the dir path is : %s" % directory)
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(directory)
    try:
        outputFileName = os.path.splitext(os.path.basename(outTarFileName))[0]
        # The command is an argument list run without a shell, so file names
        # containing spaces or shell metacharacters are passed through verbatim.
        cmd = ["python", "%s/Java/Python/getDistribution.py" % SMART_PATH,
               "-i", inputFileName, "-f", options.format, "-o", outputFileName, "-D", directory]
        valuedOptions = (
            ("-r", referenceFileName),
            ("-n", options.nbBins),
            ("-c", options.chromosome or None),
            ("-s", options.start),
            ("-e", options.end),
            ("-y", options.yMin),
            ("-Y", options.yMax),
            ("-H", options.height),
            ("-W", options.width),
        )
        for flag, value in valuedOptions:
            if value is not None:
                cmd.extend([flag, str(value)])
        booleanOptions = (
            ("-2", options.bothStrands),
            ("-w", options.raw),
            ("-x", options.csv),
            ("-g", options.gff),
            ("-l", options.log),
        )
        for flag, isSet in booleanOptions:
            if isSet:
                cmd.append(flag)
        print("cmd is:  %s" % " ".join(cmd))
        status = subprocess.call(cmd)
        if status != 0:
            raise Exception("Problem with the execution of command!")
        toTar(outTarFileName, directory)
    finally:
        # Always leave and remove the scratch directory, even on failure, so
        # that stale files cannot end up in the archive of a later run.
        os.chdir(absPath)
        shutil.rmtree(directory)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/WrappGetReadDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/WrappGetReadDistribution.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
"""Wrapper which runs getReadDistribution.py in a scratch directory and packs its outputs in a tar file."""
from optparse import OptionParser
import tarfile
import os
import re
import shutil
import subprocess

SMART_PATH = "%s/SMART" % os.environ["REPET_PATH"]


def toTar(tarFileName, directory):
    """Pack the files of `directory` whose name contains the base name of `tarFileName`.

    The archive is first written to "<tarFileName without extension>.tmp.tar",
    closed, and only then moved to `tarFileName`, so that the final file is
    always a complete archive.  Members are stored under their bare file name.
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarName, "w")
    try:
        for entry in os.listdir(directory):
            path = os.path.join(directory, entry)
            # never try to put the archive inside itself
            if os.path.abspath(path) == os.path.abspath(tmpTarName):
                continue
            # plain substring test: the base name is a file name, not a regular expression
            if fileNameBaseName in entry:
                tfile.add(path, arcname=entry)
    finally:
        tfile.close()
    shutil.move(tmpTarName, tarFileName)


if __name__ == "__main__":

    # parse command line
    description = "Get Read Distribution v1.0.1: Plot the number of identical reads and give the most represented. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file sequence [compulsory] [format: file in sequence format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: sequence file format]")
    parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="keep the best n [format: int]")
    # "\\%" keeps the original help text (backslash followed by percent) without an invalid escape sequence
    parser.add_option("-p", "--percent", dest="percent", action="store", default=None, type="float", help="keep the best n\\% [format: float]")
    parser.add_option("-o", "--output", dest="outTarFileName", action="store", type="string", help="output file [compulsory] [format: zip]")

    (options, args) = parser.parse_args()

    if options.inputFileName is None or options.format is None or options.outTarFileName is None:
        parser.error("options -i, -f and -o are compulsory")

    # Resolve the user paths before changing directory, otherwise relative
    # paths would be looked up inside the scratch directory.
    absPath = os.getcwd()
    inputFileName = os.path.abspath(options.inputFileName)
    outTarFileName = os.path.abspath(options.outTarFileName)

    print("the current path is : %s" % absPath)
    directory = "/tmp/wrappGetReadDistribution"
    print("the dir path is : %s" % directory)
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(directory)
    try:
        outputFileName = os.path.splitext(os.path.basename(outTarFileName))[0]
        # The command is an argument list run without a shell, so file names
        # containing spaces or shell metacharacters are passed through verbatim.
        cmd = ["python", "%s/Java/Python/getReadDistribution.py" % SMART_PATH,
               "-i", inputFileName, "-f", options.format, "-o", outputFileName, "-D", directory]
        for flag, value in (("-n", options.number), ("-p", options.percent)):
            if value is not None:
                cmd.extend([flag, str(value)])
        print("cmd is:  %s" % " ".join(cmd))
        status = subprocess.call(cmd)
        if status != 0:
            raise Exception("Problem with the execution of command!")
        toTar(outTarFileName, directory)
    finally:
        # Always leave and remove the scratch directory, even on failure, so
        # that stale files cannot end up in the archive of a later run.
        os.chdir(absPath)
        shutil.rmtree(directory)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/WrappPlotCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/WrappPlotCoverage.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
"""Wrapper which runs plotCoverage.py in a scratch directory and packs its outputs in a tar file."""
from optparse import OptionParser
import tarfile
import os
import re
import shutil
import subprocess

SMART_PATH = "%s/SMART" % os.environ["REPET_PATH"]


def toTar(tarFileName, directory):
    """Pack the files of `directory` whose name contains the base name of `tarFileName`.

    The archive is first written to "<tarFileName without extension>.tmp.tar",
    closed, and only then moved to `tarFileName`, so that the final file is
    always a complete archive.  Members are stored under their bare file name.
    """
    fileName = os.path.splitext(tarFileName)[0]
    fileNameBaseName = os.path.basename(fileName)
    tmpTarName = fileName + ".tmp.tar"
    tfile = tarfile.open(tmpTarName, "w")
    try:
        for entry in os.listdir(directory):
            path = os.path.join(directory, entry)
            # never try to put the archive inside itself
            if os.path.abspath(path) == os.path.abspath(tmpTarName):
                continue
            # plain substring test: the base name is a file name, not a regular expression
            if fileNameBaseName in entry:
                tfile.add(path, arcname=entry)
    finally:
        tfile.close()
    shutil.move(tmpTarName, tarFileName)



if __name__ == "__main__":

    # parse command line
    description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]")
    parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]")
    parser.add_option("-o", "--output", dest="outTarFileName", action="store", type="string", help="output file [compulsory] [format: output file in zip format]")
    parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]")
    parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]")
    parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]")
    parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]")
    parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]")
    parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]")
    parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]")
    parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]")
    parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]")
    parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    compulsory = (options.inputFileName1, options.inputFormat1, options.inputFileName2, options.inputFormat2, options.outTarFileName)
    if None in compulsory:
        parser.error("options -i, -f, -j, -g and -o are compulsory")

    # Resolve the user paths before changing directory, otherwise relative
    # paths would be looked up inside the scratch directory.
    absPath = os.getcwd()
    inputFileName1 = os.path.abspath(options.inputFileName1)
    inputFileName2 = os.path.abspath(options.inputFileName2)
    outTarFileName = os.path.abspath(options.outTarFileName)
    inputSequence = options.inputSequence
    if inputSequence is not None:
        inputSequence = os.path.abspath(inputSequence)

    directory = "/tmp/wrappPlotCov"
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(directory)
    try:
        outputFileName = os.path.splitext(os.path.basename(outTarFileName))[0]
        print("outputfile is : %s" % outputFileName)
        # The command is an argument list run without a shell, so file names
        # and labels containing spaces are passed through verbatim.
        cmd = ["python", "%s/Java/Python/plotCoverage.py" % SMART_PATH,
               "-i", inputFileName1, "-f", options.inputFormat1,
               "-j", inputFileName2, "-g", options.inputFormat2,
               "-o", outputFileName, "-D", directory]
        valuedOptions = (
            ("-q", inputSequence),
            ("-w", options.width),
            ("-e", options.height),
            ("-t", options.title),
            ("-x", options.xLabel),
            ("-y", options.yLabel),
            ("-p", options.plusColor),
            ("-m", options.minusColor),
            ("-s", options.sumColor),
            ("-l", options.lineColor),
        )
        for flag, value in valuedOptions:
            # Title and labels default to "": emitting the flag with an empty
            # value made the next option be swallowed as its argument.
            if value is None or value == "":
                continue
            cmd.extend([flag, str(value)])
        if options.merge:
            cmd.append("-1")
        status = subprocess.call(cmd)
        if status != 0:
            raise Exception("Problem with the execution of command!")
        toTar(outTarFileName, directory)
    finally:
        # Always leave and remove the scratch directory, even on failure, so
        # that stale files cannot end up in the archive of a later run.
        os.chdir(absPath)
        shutil.rmtree(directory)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/changeGffFeatures.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/changeGffFeatures.sh Tue Apr 30 15:02:29 2013 -0400 |
b |
#!/bin/bash
# Replace every tab-delimited field equal to <old> by <new> in a file and
# print the result on standard output.
# Usage: changeGffFeatures.sh <file> <old> <new>
# NOTE: <old> is used as a sed pattern and <new> as a sed replacement, so
# characters such as '/' or '&' in them are interpreted by sed.
if [ "$#" -lt 3 ]; then
    echo "Usage: $0 <file> <old> <new>" >&2
    exit 1
fi
# "$1" is quoted so that file names containing spaces are handled.
sed "s/\t$2\t/\t$3\t/g" "$1"
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/changeTagName.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/changeTagName.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,90 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""
Change the name of a tag
"""

import os
import random
import shutil
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from commons.core.writer.Gff3Writer import Gff3Writer


if __name__ == "__main__":

    # parse command line
    description = "Change Tag Name v1.0.1: Change the name of tag of a list of transcripts. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    optionParser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    optionParser.add_option("-t", "--tag", dest="tag", action="store", type="string", help="name of the tag to change [compulsory] [format: string]")
    optionParser.add_option("-n", "--name", dest="name", action="store", type="string", help="new name for the tag [compulsory] [format: string]")
    optionParser.add_option("-y", "--mysql", dest="mysql", action="store_true", default=False, help="mySQL output [format: bool] [default: false]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
    optionParser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = optionParser.parse_args()

    # The help texts mark these options as compulsory: report a missing one
    # as a usage error instead of failing later with an unrelated message.
    for dest, flag in (("inputFileName", "-i"), ("inputFormat", "-f"), ("outputFileName", "-o"), ("tag", "-t"), ("name", "-n")):
        if getattr(options, dest) is None:
            optionParser.error("option %s is compulsory" % (flag))

    # The log file is created when requested (nothing is written to it by
    # this script); the handle is closed in the 'finally' clause below.
    logHandle = None
    if options.log:
        logHandle = open("%s.log" % options.outputFileName, "w")

    try:
        # create parser and writer(s)
        transcriptContainer = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity)
        # The transcripts are written to a temporary file first, so the output
        # only replaces an existing file once everything has been processed.
        tmpFileName = "tmpTranscriptFile%d.gff3" % (random.randint(0, 100000))
        writer = Gff3Writer(tmpFileName, options.verbosity)
        if options.mysql:
            mysqlWriter = MySqlTranscriptWriter(options.outputFileName, options.verbosity)

        # process transcripts
        progress = Progress(transcriptContainer.getNbTranscripts(), "Printing transcripts %s" % (options.inputFileName), options.verbosity)
        for transcript in transcriptContainer.getIterator():
            if options.tag in transcript.tags:
                # move the value from the old tag name to the new one
                value = transcript.tags[options.tag]
                del transcript.tags[options.tag]
                transcript.tags[options.name] = value
            writer.addTranscript(transcript)
            if options.mysql:
                mysqlWriter.addTranscript(transcript)
            progress.inc()
        progress.done()
        transcriptContainer.transcriptListParser.close()

        writer.write()

        if options.mysql:
            mysqlWriter.write()

        # shutil.move also works when the temporary file (created in the
        # current directory) and the output file are on different filesystems,
        # where os.rename raises OSError.
        shutil.move(tmpFileName, options.outputFileName)
    finally:
        if logHandle is not None:
            logHandle.close()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/cleaning/CleanerChooser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/cleaning/CleanerChooser.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,80 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import sys +from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner +from SMART.Java.Python.cleaning.GffCleaner import GffCleaner +from SMART.Java.Python.cleaning.GtfCleaner import GtfCleaner +from SMART.Java.Python.cleaning.DefaultCleaner import DefaultCleaner + +#Attention!! Do not delete the imports!! They are used to know the type of file format!!! 

class CleanerChooser(object):
    """
    A class that finds the correct cleaner
    @ivar cleanerClass: the class of the cleaner (None until findFormat is called)
    @type cleanerClass: class
    @ivar verbosity: verbosity
    @type verbosity: int
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type verbosity: int
        """
        self.verbosity = verbosity
        # Defined up front so that getCleaner() can be called safely even if
        # findFormat() has not been called yet.
        self.cleanerClass = None


    def findFormat(self, format):
        """
        Find the cleaner class which handles the given format.
        The candidates are the direct subclasses of TranscriptListCleaner;
        DefaultCleaner is selected when none of them declares the format.
        @param format: the format
        @type format: string
        """
        for cleanerClass in TranscriptListCleaner.__subclasses__():
            # getFileFormats() may return None (it is only a stub in the
            # base class), hence the explicit check.
            fileFormats = cleanerClass.getFileFormats()
            if fileFormats is not None and format in fileFormats:
                self.cleanerClass = cleanerClass
                return
        self.cleanerClass = DefaultCleaner


    def getCleaner(self):
        """
        Get an instance of the cleaner previously found
        (DefaultCleaner if findFormat was never called)
        @return: the cleaner
        """
        cleanerClass = self.cleanerClass
        if cleanerClass is None:
            cleanerClass = DefaultCleaner
        return cleanerClass(self.verbosity)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/cleaning/DefaultCleaner.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/cleaning/DefaultCleaner.py Tue Apr 30 15:02:29 2013 -0400 |
b |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Default cleaner. Does nothing but copying.
"""
import shutil
from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


class DefaultCleaner(TranscriptListCleaner):
    """Cleaner used when no specific cleaner exists: copies the input to the output unchanged"""

    def __init__(self, verbosity = 1):
        super(DefaultCleaner, self).__init__(verbosity)

    def _clean(self):
        """Copy the content of the input handle to the output handle"""
        # Copy chunk by chunk instead of reading the whole input in memory.
        shutil.copyfileobj(self.inputHandle, self.outputHandle)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/cleaning/GffCleaner.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/cleaning/GffCleaner.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Clean a GFF file (as given by NCBI or TAIR) and outputs a GFF3 file.
"""

from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

# Number of ID-less lines seen so far, per parent and per feature type.
# Used to build unique ids; module-level, hence shared by all ParsedLine
# instances created in the process.
count = {}

class ParsedLine(object):
    """
    One line of a GFF file
    @ivar line: the (stripped) line
    @ivar cpt: the index of the line in the file
    @ivar splittedLine: the 9 fields of the line
    @ivar type: the feature type (third field)
    @ivar parsedOptions: the attributes of the ninth field, as a dict
    @ivar id: the id of the feature (read or generated)
    @ivar parents: the list of parent ids, or None
    """

    def __init__(self, line, cpt):
        self.line = line
        self.cpt = cpt
        self.parse()

    def parse(self):
        """
        Split the line into its 9 fields and extract type, attributes, id and parents
        @raise Exception: if the line has less than 9 fields
        """
        self.line = self.line.strip()
        # Fields are split on any whitespace; the ninth field keeps the rest.
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        self.parseOptions()
        self.getId()
        self.getParents()

    def parseOptions(self):
        """
        Parse the ';'-separated attributes of the ninth field.
        Accepts 'key=value' and 'key value'; a lone word is taken as the ID.
        """
        self.parsedOptions = {}
        for option in self.splittedLine[8].split(";"):
            option = option.strip()
            if option == "": continue
            posSpace = option.find(" ")
            posEqual = option.find("=")
            # '=' is the separator if it comes before the first space
            if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
                key, value = option.split("=", 1)
            elif posSpace != -1:
                key, value = option.split(None, 1)
            else:
                key = "ID"
                value = option
            self.parsedOptions[key.strip()] = value.strip(" \"")

    def getId(self):
        """
        Set the id of the line: the (case-insensitive) 'id' attribute if present,
        else '<parent>-<type>-<n>' built from the first parent, else 'smart<cpt>'.
        A generated id is also stored as the 'ID' attribute.
        """
        for key in self.parsedOptions:
            if key.lower() == "id":
                self.id = self.parsedOptions[key]
                return
        if "Parent" in self.parsedOptions:
            parent = self.parsedOptions["Parent"].split(",")[0]
            typeCounts = count.setdefault(parent, {})
            typeCounts[self.type] = typeCounts.get(self.type, 0) + 1
            self.id = "%s-%s-%d" % (parent, self.type, typeCounts[self.type])
        else:
            self.id = "smart%d" % (self.cpt)
        self.parsedOptions["ID"] = self.id

    def getParents(self):
        """Set the list of parents, read from the first 'parent' or 'derives_from' attribute found (None if absent)"""
        for key in self.parsedOptions:
            if key.lower() in ("parent", "derives_from"):
                self.parents = self.parsedOptions[key].split(",")
                return
        self.parents = None

    def removeParent(self):
        """Remove the 'parent' and 'derives_from' attributes"""
        # Iterate over a copy of the keys: the dict is modified in the loop.
        for key in list(self.parsedOptions.keys()):
            if key.lower() in ("parent", "derives_from"):
                del self.parsedOptions[key]

    def export(self):
        """
        @return: the line in GFF3 format (tab-separated fields, 'key=value' attributes), with a trailing newline
        """
        self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.items()])
        return "%s\n" % ("\t".join(self.splittedLine))


class GffCleaner(TranscriptListCleaner):
    """
    Cleaner for GFF files: keeps the lines of the accepted types and writes
    each top-level line followed by its children
    @ivar lines: the parsed lines, indexed by id
    @ivar acceptedTypes: the feature types kept (None: all)
    @ivar parents: the top-level lines
    @ivar children: the lists of child lines, indexed by parent id
    """

    def __init__(self, verbosity = 1):
        super(GffCleaner, self).__init__(verbosity)
        self.lines = {}
        self.acceptedTypes = ["mRNA", "transcript", "exon"]
        self.parents = []
        self.children = {}

    def getFileFormats():
        return ["gff", "gff2", "gff3"]
    getFileFormats = staticmethod(getFileFormats)

    def setAcceptedTypes(self, types):
        self.acceptedTypes = types

    def parse(self):
        """Read the input and store the lines of accepted types, indexed by id"""
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        for cpt, line in enumerate(self.inputHandle):
            # Blank lines still contain their newline: test the stripped
            # line, otherwise they are reported as malformed lines.
            if not line.strip() or line[0] == "#": continue
            # a '>' starts the sequence section: no more annotation after it
            if line[0] == ">": break
            parsedLine = ParsedLine(line, cpt)
            if self.acceptedTypes == None or parsedLine.type in self.acceptedTypes:
                # NOTE: a line with an already seen id replaces the previous one
                self.lines[parsedLine.id] = parsedLine
            progress.inc()
        progress.done()

    def sort(self):
        """
        Attach each line to its known parents; a line with no known parent
        loses its parent attributes and becomes a top-level line
        """
        progress = Progress(len(self.lines.keys()), "Sorting file", self.verbosity)
        for line in self.lines.values():
            parentFound = False
            if line.parents:
                for parent in line.parents:
                    if parent in self.lines:
                        parentFound = True
                        self.children.setdefault(parent, []).append(line)
            if not parentFound:
                line.removeParent()
                self.parents.append(line)
            progress.inc()
        progress.done()

    def write(self):
        """Write every top-level line, each followed by its descendants"""
        progress = Progress(len(self.parents), "Writing output file", self.verbosity)
        for line in self.parents:
            self.writeLine(line)
            progress.inc()
        progress.done()

    def writeLine(self, line):
        """Write a line, then (recursively) its children"""
        self.outputHandle.write(line.export())
        if line.id in self.children:
            for child in self.children[line.id]:
                self.writeLine(child)

    def _clean(self):
        self.parse()
        self.sort()
        self.write()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/cleaning/GtfCleaner.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/cleaning/GtfCleaner.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,121 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""
Clean a GTF file
"""

import shlex
from SMART.Java.Python.cleaning.TranscriptListCleaner import TranscriptListCleaner
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

count = {}

class ParsedLine(object):
    """
    One line of a GTF file
    @ivar line: the (stripped) line
    @ivar cpt: the index of the line in the file
    @ivar splittedLine: the 9 fields of the line
    @ivar type: the feature type (third field)
    @ivar parsedOptions: the attributes of the ninth field, as a dict
    @ivar transcriptId: the value of the 'transcript_id' attribute, or None
    """

    def __init__(self, line, cpt):
        self.line = line
        self.cpt = cpt
        self.parse()

    def parse(self):
        """
        Split the line into its 9 fields and parse the attributes
        @raise Exception: if the line has less than 9 fields
        """
        self.line = self.line.strip()
        self.splittedLine = self.line.split(None, 8)
        if len(self.splittedLine) < 9:
            raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line))
        self.type = self.splittedLine[2]
        # Always defined, so that a missing attribute can be detected.
        self.transcriptId = None
        self.parseOptions()

    def parseOptions(self):
        """
        Parse the attributes of the ninth field ('key "value"; key "value"; ...').
        Each value is stored as the space-separated list of its quoted words.
        """
        self.parsedOptions = {}
        key = None
        value = ""
        for option in shlex.split(self.splittedLine[8]):
            option = option.strip()
            if option == "": continue
            if key == None:
                key = option
                continue
            # a trailing ';' ends the current attribute
            endValue = (option[-1] == ";")
            if endValue:
                # str.rstrip returns a new string: the result must be kept,
                # otherwise the ';' remains part of the value
                option = option.rstrip(";")
            if option != "":
                value = "%s \"%s\"" % (value, option)
            if endValue:
                self._storeOption(key, value)
                key = None
                value = ""
        # The last attribute may lack its final ';': do not lose it.
        if key is not None and value != "":
            self._storeOption(key, value)

    def _storeOption(self, key, value):
        """Store an attribute, and remember the transcript id"""
        self.parsedOptions[key] = value
        if key == "transcript_id":
            self.transcriptId = value

    def export(self):
        """
        @return: the line, unchanged but stripped, with a trailing newline
        """
        return "%s\n" % (self.line)


class GtfCleaner(TranscriptListCleaner):
    """
    Cleaner for GTF files: keeps the lines of the accepted types and writes
    them grouped by transcript id
    @ivar acceptedTypes: the feature types kept (None: all)
    @ivar parents: the lists of lines, indexed by transcript id
    """

    def __init__(self, verbosity = 1):
        super(GtfCleaner, self).__init__(verbosity)
        self.acceptedTypes = ["exon"]
        self.parents = {}

    def getFileFormats():
        return ["gtf"]
    getFileFormats = staticmethod(getFileFormats)

    def setAcceptedTypes(self, types):
        self.acceptedTypes = types

    def parse(self):
        """
        Read the input and group the lines of accepted types by transcript id
        @raise Exception: if an accepted line has no transcript_id attribute
        """
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity)
        for cpt, line in enumerate(self.inputHandle):
            # Blank lines still contain their newline: test the stripped
            # line, otherwise they are reported as malformed lines.
            if not line.strip() or line[0] == "#": continue
            parsedLine = ParsedLine(line, cpt)
            if self.acceptedTypes == None or parsedLine.type in self.acceptedTypes:
                if parsedLine.transcriptId is None:
                    raise Exception("Line '%s' has no transcript_id attribute. Exiting..." % (parsedLine.line))
                self.parents.setdefault(parsedLine.transcriptId, []).append(parsedLine)
            progress.inc()
        progress.done()

    def write(self):
        """Write the lines, transcript by transcript, in the order of the sorted transcript ids"""
        progress = Progress(len(self.parents.keys()), "Writing output file", self.verbosity)
        for parent in sorted(self.parents.keys()):
            for line in self.parents[parent]:
                self.outputHandle.write(line.export())
            progress.inc()
        progress.done()

    def _clean(self):
        self.parse()
        self.write()

b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/cleaning/TranscriptListCleaner.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/cleaning/TranscriptListCleaner.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,63 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

class TranscriptListCleaner(object):
    """
    A (quite generic) class that cleans a file containing transcripts.
    Subclasses implement _clean(), which reads self.inputHandle and writes
    to self.outputHandle.
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        # Set by setInputFileName / setOutputFileName
        self.inputHandle = None
        self.outputHandle = None

    def setInputFileName(self, fileName):
        """
        Open the input file
        @param fileName: name of the file to read
        @raise Exception: if the file cannot be opened
        """
        try:
            self.inputHandle = open(fileName)
        except IOError:
            # use the parameter: the instance has no 'fileName' attribute
            raise Exception("Error! Transcript file '%s' does not exist! Exiting..." % (fileName))

    def setOutputFileName(self, fileName):
        """
        Open the output file
        @param fileName: name of the file to write
        @raise Exception: if the file cannot be opened for writing
        """
        try:
            self.outputHandle = open(fileName, "w")
        except IOError:
            raise Exception("Error! Cannot open output file '%s' for writing! Exiting..." % (fileName))

    def getFileFormats():
        pass
    getFileFormats = staticmethod(getFileFormats)

    def close(self):
        """Close the input and output files (those which have been opened)"""
        for handleName in ("inputHandle", "outputHandle"):
            handle = getattr(self, handleName, None)
            if handle is not None:
                handle.close()

    def clean(self):
        """Clean the input file, and close the files even if the cleaning fails"""
        try:
            self._clean()
        finally:
            self.close()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/clusterize.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/clusterize.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,185 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+from commons.core.writer.WriterChooser import WriterChooser\n+"""Clusterize a set of transcripts"""\n+\n+import os, os.path, random\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle\n+from SMART.Java.Python.ncList.FileSorter import FileSorter\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+\n+class Clusterize(object):\n+\n+\tdef __init__(self, verbosity):\n+\t\tself.normalize\t\t = False\n+\t\tself.presorted\t\t = False\n+\t\tself.distance\t\t = 1\n+\t\tself.colinear\t\t = False\n+\t\tself.nbWritten\t\t = 0\n+\t\tself.nbMerges\t\t = 0\n+\t\tself.verbosity\t\t = verbosity\n+\t\tself.splittedFileNames = {}\n+\n+\tdef __del__(self):\n+\t\tfor fileName in self.splittedFileNames.values():\n+\t\t\tos.remove(fileName)\n+\n+\tdef setInputFile(self, fileName, format):\n+\t\tparserChooser = ParserChooser(self.verbosity)\n+\t\tparserChooser.findFormat(format)\n+\t\tself.parser = parserChooser.getParser(fileName)\n+\t\tself.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))\n+\t\tif "SMARTTMPPATH" in os.environ:\n+\t\t\tself.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName))\n+\n+\tdef setOutputFileName(self, fileName, format="gff3", 
title="S-MART", feature="transcript", featurePart="exon"):\n+\t\twriterChooser = WriterChooser()\n+\t\twriterChooser.findFormat(format)\n+\t\tself.writer = writerChooser.getWriter(fileName)\n+\t\tself.writer.setTitle(title)\n+\t\tself.writer.setFeature(feature)\n+\t\tself.writer.setFeaturePart(featurePart)\n+\n+\tdef setDistance(self, distance):\n+\t\tself.distance = distance\n+\n+\tdef setColinear(self, colinear):\n+\t\tself.colinear = colinear\n+\n+\tdef setNormalize(self, normalize):\n+\t\tself.normalize = normalize\n+\t\t\n+\tdef setPresorted(self, presorted):\n+\t\tself.presorted = presorted\n+\n+\tdef _sortFile(self):\n+\t\tif self.presorted:\n+\t\t\treturn\n+\t\tfs = FileSorter(self.parser, self.verbosity-4)\n+\t\tfs.perChromosome(True)\n+\t\tfs.setPresorted(self.presorted)\n+\t\tfs.setOutputFileName(self.sortedFileName)\n+\t\tfs.sort()\n+\t\tself.splittedFileNames = fs.getOutputFileNames()\n+\t\tself.nbElementsPerChromosome = fs.getNbElementsPerChromosome()\n+\t\tself.nbElements = fs.getNbElements()\n+\t\t\n+\tdef _iterat'..b'ipts = []\n+\t\t\tif newTranscript.__class__.__name__ == "Mapping":\n+\t\t\t\tnewTranscript = newTranscript.getTranscript()\n+\t\t\tfor oldTranscript in transcripts:\n+\t\t\t\tif self._checkOverlap(newTranscript, oldTranscript):\n+\t\t\t\t\tself._merge(newTranscript, oldTranscript)\n+\t\t\t\telif self._checkPassed(newTranscript, oldTranscript):\n+\t\t\t\t\tself._write(oldTranscript)\n+\t\t\t\telse:\n+\t\t\t\t\tnewTranscripts.append(oldTranscript)\n+\t\t\tnewTranscripts.append(newTranscript)\n+\t\t\ttranscripts = newTranscripts\n+\t\t\tself.nbElements += 1\n+\t\t\tprogress.inc()\n+\t\tfor transcript in transcripts:\n+\t\t\tself._write(transcript)\n+\t\tprogress.done()\n+\n+\tdef _merge(self, transcript1, transcript2):\n+\t\tself.nbMerges += 1\n+\t\ttranscript2.setDirection(transcript1.getDirection())\n+\t\ttranscript1.merge(transcript2)\n+\n+\tdef _write(self, transcript):\n+\t\tself.nbWritten += 
1\n+\t\tself.writer.addTranscript(transcript)\n+\n+\tdef _checkOverlap(self, transcript1, transcript2):\n+\t\tif transcript1.getChromosome() != transcript2.getChromosome():\n+\t\t\treturn False\n+\t\tif self.colinear and transcript1.getDirection() != transcript2.getDirection():\n+\t\t\treturn False\n+\t\tif transcript1.getDistance(transcript2) > self.distance:\n+\t\t\treturn False\n+\t\treturn True\n+\n+\tdef _checkPassed(self, transcript1, transcript2):\n+\t\treturn ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance))\n+\n+\tdef run(self):\n+\t\tself._sortFile()\n+\t\tif self.presorted:\n+\t\t\tself._iterate(None)\n+\t\telse:\n+\t\t\tfor chromosome in sorted(self.splittedFileNames.keys()):\n+\t\t\t\tself._iterate(chromosome)\n+\t\tself.writer.close()\n+\t\tif self.verbosity > 0:\n+\t\t\tprint "# input: %d" % (self.nbElements)\n+\t\t\tprint "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100))\n+\t\t\tprint "# merges: %d" % (self.nbMerges)\n+\t\t\n+\n+if __name__ == "__main__":\n+\tdescription = "Clusterize v1.0.3: clusterize the data which overlap. 
[Category: Merge]"\n+\n+\tparser = OptionParser(description = description)\n+\tparser.add_option("-i", "--input", dest="inputFileName", action="store",\t\t\t\t type="string", help="input file [compulsory] [format: file in transcript format given by -f]")\n+\tparser.add_option("-f", "--format", dest="format",\t\t action="store",\t\t\t\t type="string", help="format of file [format: transcript file format]")\n+\tparser.add_option("-o", "--output", dest="outputFileName", action="store",\t\t\t\t type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")\n+\tparser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff",\t\t type="string", help="output file format [format: transcript file format]")\n+\tparser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False,\t\t\t\thelp="merge colinear transcripts only [format: bool] [default: false]")\n+\tparser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. 
distance between two transcripts to be merged [format: int] [default: 0]")\n+\tparser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False,\t\t\t\thelp="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")\n+\tparser.add_option("-s", "--sorted", dest="sorted",\t\t action="store_true", default=False,\t\t\t\thelp="input is already sorted [format: bool] [default: false]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")\n+\t(options, args) = parser.parse_args()\n+\n+\tc = Clusterize(options.verbosity)\n+\tc.setInputFile(options.inputFileName, options.format)\n+\tc.setOutputFileName(options.outputFileName, options.outputFormat)\n+\tc.setColinear(options.colinear)\n+\tc.setDistance(options.distance)\n+\tc.setNormalize(options.normalize)\n+\tc.setPresorted(options.sorted)\n+\tc.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/clusterizeBySlidingWindows.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/clusterizeBySlidingWindows.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,344 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import re\n+from commons.core.writer.WriterChooser import WriterChooser\n+"""\n+Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks.\n+"""\n+\n+import os, os.path\n+from optparse import OptionParser\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from SMART.Java.Python.misc.Progress import Progress\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+\n+class ClusterizeBySlidingWindows(object):\n+\n+ def __init__(self, verbosity = 0):\n+ self.verbosity = verbosity\n+ self.strands = (0, )\n+ self.normalize = False\n+ self.plot = None\n+ self.excel = None\n+ self.outputFileName = \'\'\n+ self.defaultValue = None\n+\n+ def __del__(self):\n+ pass\n+\n+ def setInputFile(self, fileName, format):\n+ self.parser = TranscriptContainer(fileName, format, self.verbosity)\n+\n+ def setOutputFileName(self, fileName, format="gff", title="S-MART", feature="transcript", featurePart="exon"):\n+ writerChooser = WriterChooser(self.verbosity)\n+ writerChooser.findFormat(format)\n+ self.writer = writerChooser.getWriter(fileName)\n+ self.writer.setTitle(title)\n+ self.writer.setFeature(feature)\n+ self.writer.setFeaturePart(featurePart)\n+# self.outputFileName = fileName\n+# self.outputFormat = format\n+\n+ def setWindowSize(self, size):\n+ self.size = size\n+\n+ def setWindowOverlap(self, overlap):\n+ self.overlap = overlap\n+\n+ def setTag(self, 
tag):\n+ self.tag = tag\n+\n+ def setOperation(self, operation):\n+ self.operation = operation\n+\n+ def setBothStrands(self, bothStrands):\n+ if bothStrands:\n+ self.strands = (-1, 1)\n+\n+ def setNormalize(self, normalize):\n+ self.normalize = normalize\n+\n+ def setPlot(self, plot):\n+ self.plot = plot\n+\n+ def setExcel(self, excel):\n+ self.excel = excel\n+\n+ def setOutputTag(self, tag):\n+ self.outputTagName = tag\n+ \n+ def setDefaultValue(self, defaultValue):\n+ self.defaultValue = defaultValue\n+\n+ def checkOptions(self):\n+# if self.operation != None:\n+# raise Exception("Trying to combine the values without specifying tag! Aborting...")\n+ if self.operation != '..b'lf.excel:\n+ self.writeExcel()\n+ if self.plot:\n+ self.plotData()\n+ self.printRegions()\n+\n+\n+if __name__ == "__main__":\n+ \n+ # parse command line\n+ description = "Clusterize by Sliding Windows v1.0.1: Produces a GFF3 file that clusters a list of transcripts using a sliding window. [Category: Sliding Windows]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")\n+ parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")\n+ parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="format of the output file [format: transcript file format]")\n+ parser.add_option("-s", "--size", dest="size", action="store", type="int", help="size of the regions [compulsory] [format: int]")\n+ parser.add_option("-e", "--overlap", dest="overlap", action="store", type="int", help="overlap 
between two consecutive regions [compulsory] [format: int]")\n+ parser.add_option("-m", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")\n+ parser.add_option("-g", "--tag", dest="tag", action="store", default=None, type="string", help="use a given tag as input (instead of summing number of features) [format: string]") \n+ parser.add_option("-r", "--operation", dest="operation", action="store", default=None, type="string", help="combine tag value with given operation [format: choice (sum, avg, med, min, max)]")\n+ parser.add_option("-d", "--defaultValue",dest="defaultValue", action="store", type="float", help="default value for input tag [format: float]")\n+ parser.add_option("-w", "--write", dest="writeTag", action="store", default=None, type="string", help="print the result in the given tag (default usually is \'nbElements\') [format: string]") \n+ parser.add_option("-2", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]")\n+ parser.add_option("-p", "--plot", dest="plot", action="store", default=None, type="string", help="plot regions to the given file [format: output file in PNG format]")\n+ parser.add_option("-x", "--excel", dest="excel", action="store", default=None, type="string", help="write an Excel file to the given file [format: output file in Excel format]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")\n+ (options, args) = parser.parse_args()\n+\n+ cbsw = ClusterizeBySlidingWindows(options.verbosity)\n+ cbsw.setInputFile(options.inputFileName, options.inputFormat)\n+ cbsw.setOutputFileName(options.outputFileName, options.outputFormat)\n+ cbsw.setWindowSize(options.size)\n+ cbsw.setWindowOverlap(options.overlap)\n+ 
cbsw.setTag(options.tag)\n+ cbsw.setDefaultValue(options.defaultValue)\n+ cbsw.setOperation(options.operation)\n+ cbsw.setOutputTag(options.writeTag)\n+ cbsw.setBothStrands(options.strands)\n+ cbsw.setPlot(options.plot)\n+ cbsw.setExcel(options.excel)\n+ cbsw.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/compareOverlapping.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/compareOverlapping.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,126 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""Compare overlap of two transcript lists"""\n+import sys\n+import os\n+from optparse import OptionParser\n+from SMART.Java.Python.misc import Utils\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from commons.core.writer.TranscriptWriter import TranscriptWriter\n+from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+\n+class CompareOverlapping(object):\n+\n+ def __init__(self):\n+ self._options = None\n+\n+\n+ def setAttributesFromCmdLine(self):\n+ description = "Compare Overlapping v1.0.3: Get the data which overlap with a reference set. 
[Category: Data Comparison]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")\n+ parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")\n+ parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")\n+ parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")\n+ parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")\n+ parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")\n+ parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")\n+ parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int'..b'pping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")\n+ parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")\n+ parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")\n+ 
(self._options, args) = parser.parse_args()\n+\n+\n+ def run(self): \n+ logHandle = None\n+ if self._options.log:\n+ logHandle = open(self._options.output, "w")\n+\n+ transcriptContainer1 = TranscriptContainer(self._options.inputFileName1, self._options.format1, self._options.verbosity)\n+ transcriptContainer2 = TranscriptContainer(self._options.inputFileName2, self._options.format2, self._options.verbosity)\n+ writer = TranscriptWriter(self._options.output, "gff3", self._options.verbosity)\n+\n+ transcriptListComparator = TranscriptListsComparator(logHandle, self._options.verbosity)\n+ transcriptListComparator.restrictToStart(transcriptListComparator.QUERY, self._options.start1)\n+ transcriptListComparator.restrictToStart(transcriptListComparator.REFERENCE, self._options.start2)\n+ transcriptListComparator.restrictToEnd(transcriptListComparator.QUERY, self._options.end1)\n+ transcriptListComparator.restrictToEnd(transcriptListComparator.REFERENCE, self._options.end2)\n+ transcriptListComparator.extendFivePrime(transcriptListComparator.QUERY, self._options.fivePrime1)\n+ transcriptListComparator.extendFivePrime(transcriptListComparator.REFERENCE, self._options.fivePrime2)\n+ transcriptListComparator.extendThreePrime(transcriptListComparator.QUERY, self._options.threePrime1)\n+ transcriptListComparator.extendThreePrime(transcriptListComparator.REFERENCE, self._options.threePrime2)\n+ transcriptListComparator.acceptIntrons(transcriptListComparator.QUERY, self._options.introns)\n+ transcriptListComparator.acceptIntrons(transcriptListComparator.REFERENCE, self._options.introns)\n+ transcriptListComparator.getAntisenseOnly(self._options.antisense)\n+ transcriptListComparator.getColinearOnly(self._options.colinear)\n+ transcriptListComparator.getInvert(self._options.exclude)\n+ transcriptListComparator.setMaxDistance(self._options.distance)\n+ transcriptListComparator.setMinOverlap(self._options.minOverlap)\n+ 
transcriptListComparator.setPcOverlap(self._options.pcOverlap)\n+ transcriptListComparator.setIncludedOnly(self._options.included)\n+ transcriptListComparator.setIncludingOnly(self._options.including)\n+ transcriptListComparator.includeNotOverlapping(self._options.notOverlapping)\n+ transcriptListComparator.computeOdds(True)\n+ transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, transcriptContainer1)\n+ transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.REFERENCE, transcriptContainer2)\n+ transcriptListComparator.setOutputWriter(writer)\n+ transcriptListComparator.compareTranscriptList()\n+\n+ if self._options.log:\n+ logHandle.close()\n+\n+ if not self._options.exclude:\n+ odds = transcriptListComparator.getOdds()\n+ if self._options.verbosity > 0 and odds:\n+ print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(odds)\n+ \n+if __name__ == "__main__":\n+ icompareOverlapping = CompareOverlapping()\n+ icompareOverlapping.setAttributesFromCmdLine()\n+ icompareOverlapping.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/convertTranscriptFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/convertTranscriptFile.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,115 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +""" +Read a transcript file and convert it to another format +""" + +import os, re +from optparse import OptionParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from commons.core.writer.TranscriptWriter import TranscriptWriter +from SMART.Java.Python.misc.Progress import Progress + + +class ConvertTranscriptFile(object): + def __init__(self,inputFileName="", inputFormat ="", outputFileName="", outputFormat="", name="", sequenceFileName=None, strands=False, galaxy=False, feature=None, featurePart=None, verbosity=1): + self.inputFileName = inputFileName + self.inputFormat = inputFormat + self.outputFileName = outputFileName + self.outputFormat = outputFormat + self.name = name + self.sequenceFileName = sequenceFileName + self.strands = strands + self.galaxy = galaxy + + self.feature=feature + self.featurePart=featurePart + + self.verbosity = verbosity + + def setAttributesFromCmdLine(self): + description = "Convert Transcript File v1.0.3: Convert a file from a format to another. 
[Category: Conversion]" + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in format given by -f]") + parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript or mapping file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in format given by -g]") + parser.add_option("-g", "--outputFormat", dest="outputFormat", action="store", type="string", help="format of the output file [compulsory] [format: transcript file format]") + parser.add_option("-n", "--name", dest="name", action="store", default="SMART", type="string", help="name for the transcripts [format: string] [default: SMART]") + parser.add_option("-s", "--sequences", dest="sequenceFileName", action="store", default=None, type="string", help="give the corresponding Multi-Fasta file (useful for EMBL format) [format: string]") + parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="consider the 2 strands as different (only useful for writing WIG files) [format: bool] [default: False]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") + parser.add_option("-G", "--galaxy", dest="galaxy", action="store_true", default=False, help="used for galaxy [format: bool] [default: False]") + (options, args) = parser.parse_args() + self._setAttributesFromOptions(options) + + def _setAttributesFromOptions(self, options): + self.inputFileName = options.inputFileName + self.inputFormat = options.inputFormat + self.outputFileName = options.outputFileName + self.outputFormat = options.outputFormat + self.name = options.name + self.sequenceFileName = 
options.sequenceFileName + self.strands = options.strands + self.galaxy = options.galaxy + self.verbosity = options.verbosity + + def run(self): + # create parser + parser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity) + # create writer + writer = TranscriptWriter(self.outputFileName, self.outputFormat, self.verbosity) + # connect parser and writer + writer.setContainer(parser) + + if self.name != None: + writer.setTitle(self.name) + if self.feature != None: + writer.setFeature(self.feature) + if self.featurePart != None: + writer.setFeaturePart(self.featurePart) + if self.sequenceFileName != None: + writer.addSequenceFile(self.sequenceFileName) + + nbItems = 0 + if self.verbosity > 0: + nbItems = parser.getNbItems() + print "%i items found" % (nbItems) + + if self.strands: + writer.setStrands(True) + # convert + writer.write() + writer.close() + +if __name__ == "__main__": + iConvertTranscriptFile = ConvertTranscriptFile() + iConvertTranscriptFile.setAttributesFromCmdLine() + iConvertTranscriptFile.run() |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/coordinatesToSequence.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/coordinatesToSequence.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,64 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +"""Convert a list of coordinates to sequences""" + +from optparse import OptionParser +from commons.core.parsing.FastaParser import FastaParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from commons.core.writer.FastaWriter import FastaWriter +from SMART.Java.Python.misc.Progress import Progress + + +if __name__ == "__main__": + + # parse command line + description = "Coordinates to Sequences v1.0.2: Extract the sequences from a list of coordinates. [Category: Conversion]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]") + parser.add_option("-s", "--sequences", dest="sequences", action="store", type="string", help="file that contains the sequences [compulsory] [format: file in FASTA format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file (FASTA format) [format: output file in FASTA format]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + # create parser + parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity) + + sequenceParser = FastaParser(options.sequences, options.verbosity) + + writer = FastaWriter(options.outputFileName, options.verbosity) + progress = Progress(parser.getNbTranscripts(), "Reading %s" % (options.inputFileName), options.verbosity) + for transcript in parser.getIterator(): + sequence = transcript.extractSequence(sequenceParser) + writer.addSequence(sequence) + progress.inc() + progress.done() |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getDifference.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getDifference.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,155 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""Restrict a transcript list with some parameters (regions)"""\n+\n+from optparse import OptionParser\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from commons.core.parsing.FastaParser import FastaParser\n+from SMART.Java.Python.misc.Progress import Progress\n+\n+class DifferenceGetter(object):\n+\n+ def __init__(self, verbosity):\n+ self.verbosity = verbosity\n+ self.annotationParser = None\n+ self.referenceParser = None\n+ self.sequenceParser = None\n+ self.transcriptCount = 1\n+ self.split = False\n+\n+ def createTranscript(self, chromosome, start, end):\n+ transcript = Transcript()\n+ transcript.setChromosome(chromosome)\n+ transcript.setDirection("+")\n+ transcript.setStart(start)\n+ transcript.setEnd(end)\n+ transcript.setName("region_%d" % self.transcriptCount)\n+ transcript.setTagValue("ID", "region_%d" % self.transcriptCount)\n+ self.transcriptCount += 1\n+ return transcript\n+\n+ def setSplit(self, split):\n+ self.split = split\n+\n+ def setAnnotationFile(self, fileName, format):\n+ if fileName != None:\n+ self.annotationParser = TranscriptContainer(fileName, format, self.verbosity)\n+\n+ def setReferenceFile(self, fileName, format):\n+ if fileName != None:\n+ self.referenceParser = TranscriptContainer(fileName, format, self.verbosity)\n+\n+ def setSequenceFile(self, 
fileName):\n+ if fileName != None:\n+ self.sequenceParser = FastaParser(fileName, self.verbosity)\n+\n+ def setOutputFile(self, fileName):\n+ self.writer = Gff3Writer(fileName, self.verbosity)\n+\n+ def initialize(self):\n+ self.presence = {}\n+ for chromosome in self.sequenceParser.getRegions():\n+ self.presence[chromosome] = [[1, self.sequenceParser.getSizeOfRegion(chromosome)]]\n+\n+ def readTranscripts(self):\n+ nbTranscripts = self.annotationParser.getNbTranscripts()\n+ progress = Progress(nbTranscripts, "Parsing annotation file" , self.verbosity)\n+ for transcript in self.annotationParser.getIterator():\n+ chromosome = transcript.getChromosome()\n+ '..b'me]):\n+ start, end = element\n+ if start <= transcript.getEnd() and transcript.getStart() <= end:\n+ toBeDeleted.append(i)\n+ if start < transcript.getStart():\n+ toBeAppended.append([start, transcript.getStart() - 1])\n+ if end > transcript.getEnd():\n+ toBeAppended.append([transcript.getEnd() + 1, end])\n+ for i in reversed(toBeDeleted):\n+ del self.presence[chromosome][i]\n+ self.presence[chromosome].extend(toBeAppended)\n+ progress.inc()\n+ progress.done()\n+\n+ def writeOutput(self):\n+ for chromosome in self.presence:\n+ for element in self.presence[chromosome]:\n+ start, end = element\n+ self.writer.addTranscript(self.createTranscript(chromosome, start, end))\n+ self.writer.write()\n+\n+ def compareToSequence(self):\n+ self.initialize()\n+ self.readTranscripts()\n+ self.writeOutput()\n+\n+ def compareToAnnotation(self):\n+ transcriptListComparator = TranscriptListsComparator(None, self.verbosity)\n+ transcriptListComparator.setSplitDifference(self.split)\n+ transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, self.annotationParser)\n+ transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.REFERENCE, self.referenceParser)\n+ transcriptListComparator.setOutputWriter(self.writer)\n+ transcriptListComparator.getDifferenceTranscriptList()\n+\n+ def 
run(self):\n+ if self.referenceParser != None:\n+ self.compareToAnnotation()\n+ else:\n+ self.compareToSequence()\n+\n+\n+if __name__ == "__main__":\n+ \n+ # parse command line\n+ description = "Get Difference v1.0.1: Get all the regions of the genome, except the one given or get all the elements from the first set which does not ovelap with the second set (at the nucleotide level). [Category: Data Comparison]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")\n+ parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format [compulsory] [format: transcript file format]")\n+ parser.add_option("-j", "--input2", dest="inputFileName2", action="store", default=None, type="string", help="reference file [format: file in transcript format given by -g]")\n+ parser.add_option("-g", "--format2", dest="format2", action="store", default=None, type="string", help="format of the reference file [format: transcript file format]")\n+ parser.add_option("-s", "--sequence", dest="sequenceFileName", action="store", default=None, type="string", help="sequence file [format: file in FASTA format]")\n+ parser.add_option("-p", "--split", dest="split", action="store_true", default=False, help="when comparing to a set of genomic coordinates, do not join [format: boolean] [default: False")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")\n+ (options, args) = parser.parse_args()\n+\n+ getter = DifferenceGetter(options.verbosity)\n+ getter.setSplit(options.split)\n+ getter.setAnnotationFile(options.inputFileName1, options.format1)\n+ 
getter.setSequenceFile(options.sequenceFileName)\n+ getter.setReferenceFile(options.inputFileName2, options.format2)\n+ getter.setOutputFile(options.outputFileName)\n+ getter.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getDistance.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getDistance.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,241 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""Get the distance between the transcripts of two lists"""\n+\n+import os\n+import sys\n+from optparse import OptionParser\n+from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+\n+class GetDistance(object):\n+\n+ def __init__(self, verbosity = 0):\n+ self.verbosity = verbosity\n+ self.writer = None\n+ self.spearman = False\n+ self.tlc = TranscriptListsComparator(None, self.verbosity)\n+ self.strands = (0, )\n+ self.buckets = None\n+ self.title = ""\n+ self.xMin = None\n+ self.xMax = None\n+ self.proportion = False\n+ self.outputFileName = None\n+ self.keep = False\n+\n+ def __del__(self):\n+ pass\n+\n+ def setQueryFile(self, fileName, format):\n+ self.transcriptContainer1 = TranscriptContainer(fileName, format, self.verbosity)\n+ \n+ def setReferenceFile(self, fileName, format):\n+ self.transcriptContainer2 = TranscriptContainer(fileName, format, self.verbosity)\n+\n+ def setOutputFile(self, fileName):\n+ self.outputFileName = fileName\n+ \n+ def setOutputTranscriptFile(self, fileName):\n+ if fileName != None:\n+ self.writer = Gff3Writer(fileName, self.verbosity)\n+ \n+ def restrictQueryToStart(self, number):\n+ self.tlc.restrictToStart(self.tlc.QUERY, number)\n+\n+ def restrictReferenceToStart(self, number):\n+ self.tlc.restrictToStart(self.tlc.REFERENCE, number)\n+\n+ def restrictQueryToEnd(self, 
number):\n+ self.tlc.restrictToEnd(self.tlc.QUERY, number)\n+\n+ def restrictReferenceToEnd(self, number):\n+ self.tlc.restrictToEnd(self.tlc.REFERENCE, number)\n+\n+ def setAbsolute(self, boolean):\n+ self.tlc.setAbsolute(boolean)\n+\n+ def setProportion(self, boolean):\n+ self.proportion = boolean\n+\n+ def setColinear(self, boolean):\n+ self.tlc.getColinearOnly(boolean)\n+\n+ def setAntisense(self, boolean):\n+ self.tlc.getAntisenseOnly(boolean)\n+\n+ def setDistances(self, minDistance, maxDistance):\n+ self.tlc.setMinDistance(minDistance)\n+ self.tlc.setMaxDistance(maxDistance)\n+\n+ def setStrands(s'..b'"--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first 5\' nucleotides for list 2 [format: int]")\n+ parser.add_option("-e", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last 3\' nucleotides for list 1 [format: int]")\n+ parser.add_option("-E", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last 3\' nucleotides for list 2 [format: int]")\n+ parser.add_option("-m", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance considered between two transcripts [format: int] [default: None]")\n+ parser.add_option("-M", "--maxDistance", dest="maxDistance", action="store", default=1000, type="int", help="maximum distance considered between two transcripts [format: int] [default: 1000]")\n+ parser.add_option("-5", "--fivePrime", dest="fivePrime", action="store_true", default=False, help="consider the elements from list 1 which are upstream of elements of list 2 [format: bool] [default: False]")\n+ parser.add_option("-3", "--threePrime", dest="threePrime", action="store_true", default=False, help="consider the elements from list 1 which are downstream of elements of list 2 [format: bool] [default: False]")\n+ parser.add_option("-u", "--buckets", dest="buckets", action="store", default=None, 
type="int", help="plot histogram instead of line plot with given interval size [format: int] [default: None]")\n+ parser.add_option("-2", "--2strands", dest="twoStrands", action="store_true", default=False, help="plot the distributions of each strand separately [format: bool] [default: False]")\n+ parser.add_option("-r", "--spearman", dest="spearman", action="store_true", default=False, help="compute Spearman rho [format: bool] [default: False]")\n+ parser.add_option("-x", "--xMin", dest="xMin", action="store", default=None, type="int", help="minimum value on the x-axis to plot [format: int] [default: None]")\n+ parser.add_option("-X", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int] [default: None]")\n+ parser.add_option("-t", "--title", dest="title", action="store", default=None, type="string", help="title for the graph [format: int] [default: None]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")\n+ parser.add_option("-k", "--keep", dest="keep", action="store_true", default=False, help="keep temporary files [format: bool]")\n+ (options, args) = parser.parse_args()\n+\n+ gd = GetDistance(options.verbosity)\n+ gd.setQueryFile(options.inputFileName1, options.format1)\n+ gd.setReferenceFile(options.inputFileName2, options.format2)\n+ gd.setOutputFile(options.outputFileName)\n+ gd.setOutputTranscriptFile(options.outputDistances)\n+ gd.setColinear(options.colinear)\n+ gd.setAntisense(options.antisense)\n+ gd.setAbsolute(options.absolute)\n+ gd.setProportion(options.proportion)\n+ gd.restrictQueryToStart(options.start1)\n+ gd.restrictReferenceToStart(options.start2)\n+ gd.restrictQueryToEnd(options.end1)\n+ gd.restrictReferenceToEnd(options.end2)\n+ gd.setDistances(options.minDistance, options.maxDistance)\n+ gd.setUpstream(options.fivePrime)\n+ gd.setDownstream(options.threePrime)\n+ 
gd.setStrands(options.twoStrands)\n+ gd.setBuckets(options.buckets)\n+ gd.setTitle(options.title)\n+ gd.setXValues(options.xMin, options.xMax)\n+ gd.keepTmpValues(options.keep)\n+ gd.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getDistribution.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,291 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""Get the repartition of some elements in a chromosomes"""\n+\n+import os\n+from optparse import OptionParser\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from SMART.Java.Python.misc.Progress import Progress\n+from math import *\n+\n+def divideKeyDict(dictionary, ratio):\n+ return dict([(key / ratio, dictionary[key]) for key in dictionary])\n+\n+\n+def setTranscript(chromosome, direction, start, end, name, value):\n+ transcript = Transcript()\n+ transcript.setChromosome(chromosome)\n+ transcript.setDirection(direction)\n+ transcript.setStart(start)\n+ transcript.setEnd(end)\n+ transcript.setName(name)\n+ transcript.setTagValue("nbElements", value)\n+ return transcript\n+\n+\n+\n+if __name__ == "__main__":\n+ \n+ magnifyingFactor = 1000\n+ \n+ # parse command line\n+ description = "Get Distribution v1.0.1: Get the distribution of the genomic coordinates on a genome. 
[Category: Visualization]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")\n+ parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")\n+ parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [compulsory] [format: file in FASTA format]")\n+ parser.add_option("-n", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]")\n+ parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]")\n+ parser.add_option("-w", "--raw", dest="raw", '..b' plotter.addLine(divideKeyDict(densityPlus[chromosome], ratio))\n+ if options.raw:\n+ plotter.addLine(divideKeyDict(binsMinus[chromosome], ratio))\n+ else:\n+ plotter.addLine(divideKeyDict(densityMinus[chromosome], ratio))\n+ else:\n+ if options.raw:\n+ plotter.addLine(divideKeyDict(bins[chromosome], ratio))\n+ else:\n+ plotter.addLine(divideKeyDict(density[chromosome], ratio))\n+ plotter.plot()\n+ \n+ if options.csv:\n+ outputFileName = "%s" % (options.outputFileName)\n+ if options.chromosome != None:\n+ outputFileName += "_%s" % (options.chromosome)\n+ if options.start != None and options.end != None:\n+ outputFileName += ":%d-%d" % (options.start, options.end)\n+ outputFileName += ".csv"\n+ csvHandle = open(outputFileName, "w")\n+ for slice in range(start / sliceSize, maxSlice + 1):\n+ csvHandle.write(";%d-%d" % (slice * sliceSize + 1, 
(slice+1) * sliceSize))\n+ csvHandle.write("\\n")\n+ if options.bothStrands:\n+ for chromosome in densityPlus:\n+ if len(densityPlus[chromosome]) > 0:\n+ csvHandle.write("%s [+]" % (chromosome))\n+ for slice in sorted(densityPlus[chromosome].keys()):\n+ csvHandle.write(";%.2f" % (densityPlus[chromosome][slice]))\n+ csvHandle.write("\\n") \n+ if len(densityMinus[chromosome]) > 0:\n+ csvHandle.write("%s [-]" % (chromosome))\n+ for slice in sorted(densityPlus[chromosome].keys()):\n+ csvHandle.write(";%.2f" % (-densityMinus[chromosome][slice]))\n+ csvHandle.write("\\n") \n+ else:\n+ for chromosome in density:\n+ if len(density[chromosome]) > 0:\n+ csvHandle.write(chromosome)\n+ for slice in sorted(density[chromosome].keys()):\n+ csvHandle.write(";%.2f" % (density[chromosome][slice]))\n+ csvHandle.write("\\n")\n+ csvHandle.close()\n+ \n+ if options.gff:\n+ chromosome = "" if options.chromosome == None else options.chromosome.capitalize()\n+ start = "" if options.start == None else "%d" % (options.start)\n+ end = "" if options.end == None else "%d" % (options.end)\n+ link1 = "" if options.start == None and options.end == None else ":"\n+ link2 = "" if options.start == None and options.end == None else "-"\n+ writer = Gff3Writer("%s%s%s%s%s.gff3" % (options.outputFileName, link1, start, link2, end), options.verbosity)\n+ cpt = 1\n+ if options.raw:\n+ valuesPlus = binsPlus\n+ valuesMinus = binsMinus\n+ values = bins\n+ else:\n+ valuesPlus = densityPlus\n+ valuesMinus = densityMinus\n+ values = density\n+ if options.bothStrands:\n+ for chromosome in values:\n+ for slice in valuesPlus[chromosome]:\n+ writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), valuesPlus[chromosome][slice]))\n+ cpt += 1\n+ for slice in valuesMinus[chromosome]:\n+ writer.addTranscript(setTranscript(chromosome, -1, slice, slice + sliceSize, "region%d" % (cpt), - valuesMinus[chromosome][slice]))\n+ cpt += 1\n+ else:\n+ for chromosome in values:\n+ for slice in 
values[chromosome]:\n+ writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), values[chromosome][slice]))\n+ cpt += 1\n+ writer.write()\n+\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getExons.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getExons.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,128 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress

# Turn a one-based exon rank, as typed by the user, into a zero-based index.
# Zero and negative values are returned unchanged, so negative ranks keep
# counting from the last exon (despite its name, the conversion goes from
# one-based to zero-based).
zeroBaseToOneBaseConvertor = (lambda x: x - 1 if x > 0 else x)


class GetExons(object):
    """Write the exons of each input transcript as stand-alone transcripts.

    An optional selection (see setSelection) restricts the output to some
    exon ranks of every transcript.
    """

    def __init__(self, verbosity):
        """Store the trace level; no selection is active by default."""
        self.verbosity = verbosity
        self.selection = False
        # Defined here so that the attributes exist even if setSelection()
        # is never called.
        self.selectionItems = []
        self.selectionIntervals = []

    def setInputFile(self, fileName, format):
        """Build the parser matching 'format' for the file 'fileName'."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.parser = chooser.getParser(fileName)

    def setSelection(self, selection):
        """Parse a selection string such as '1,2,5..-3,-1'.

        Comma-separated parts are either a single integer (stored in
        self.selectionItems) or two integers joined by '..' (stored as a
        pair in self.selectionIntervals).  None leaves the selection
        disabled.

        Raises Exception when a part is not made of one or two integers.
        """
        if selection is None:
            return
        self.selection = True
        self.selectionItems = []
        self.selectionIntervals = []
        for part in selection.split(","):
            try:
                splittedPart = [int(element) for element in part.split("..")]
            except ValueError:
                # Report the raw text: the parsed list does not exist when
                # the conversion fails.
                raise Exception("Elements '" + part + "' of selection '" + selection + "' do not seem to be integers!")
            if len(splittedPart) == 1:
                self.selectionItems.append(splittedPart[0])
            elif len(splittedPart) == 2:
                self.selectionIntervals.append((splittedPart[0], splittedPart[1]))
            else:
                raise Exception("Cannot parse elements '" + part + "' of selection '" + selection + "'!")

    def getSelectionExonIndices(self, nbExons):
        """Return the zero-based indices selected among 'nbExons' exons.

        Without an active selection every index is returned.  Single ranks
        that do not exist in a transcript of this size are skipped, in the
        same way as intervals are clipped by slicing.
        """
        if not self.selection:
            return list(range(nbExons))
        indices = []
        for item in self.selectionItems:
            index = zeroBaseToOneBaseConvertor(item)
            # A transcript with fewer exons than the requested rank simply
            # has no such exon: skip it instead of raising IndexError.
            if -nbExons <= index < nbExons:
                indices.append(index % nbExons)
        for start, end in self.selectionIntervals:
            start, end = map(zeroBaseToOneBaseConvertor, (start, end))
            # NOTE(review): only ends greater than 1 are made inclusive here;
            # an end of 1 or a negative end stays exclusive (e.g. '1..-1'
            # leaves out the last exon) -- confirm that this is intended.
            if end > 0:
                end += 1
            indices.extend(range(nbExons)[start:end])
        return indices

    def setOutputFile(self, fileName):
        """Create the GFF3 writer for 'fileName'."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def run(self):
        """Read every transcript and write its selected exons."""
        progress = Progress(self.parser.getNbTranscripts(), "Reading input file", self.verbosity)
        nbExons = 0
        for cpt1, transcript in enumerate(self.parser.getIterator()):
            # A set makes the membership test below independent of the
            # number of selected exons.
            selectedExons = set(self.getSelectionExonIndices(transcript.getNbExons()))
            transcript.sortExons()
            for cpt2, exon in enumerate(transcript.getExons()):
                if cpt2 not in selectedExons:
                    continue
                exonTranscript = Transcript()
                exonTranscript.copy(exon)
                # The exon becomes a top-level feature: drop the link to its
                # former parent and relabel it.
                if "Parent" in exonTranscript.tags:
                    del exonTranscript.tags["Parent"]
                exonTranscript.tags["feature"] = "transcript"
                if "ID" not in exonTranscript.tags or exonTranscript.tags["ID"] == "unnamed transcript":
                    exonTranscript.tags["ID"] = "exon_%d-%d" % (cpt1+1, cpt2+1)
                if exonTranscript.getName() == "unnamed transcript":
                    exonTranscript.setName("exon_%d-%d" % (cpt1+1, cpt2+1))
                self.writer.addTranscript(exonTranscript)
                nbExons += 1
            progress.inc()
        self.writer.write()
        self.writer.close()
        progress.done()
        if self.verbosity > 1:
            print("%d transcripts read" % (self.parser.getNbTranscripts()))
            print("%d exons written" % (nbExons))


if __name__ == "__main__":

    description = "Get Exons v1.0.1: Get the exons of a set of transcripts. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-s", "--select", dest="select", action="store", default=None, type="string", help="select some of the exons (like '1,2,5..-3,-1') [format: string]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    ge = GetExons(options.verbosity)
    ge.setInputFile(options.inputFileName, options.format)
    ge.setSelection(options.select)
    ge.setOutputFile(options.outputFileName)
    ge.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getIntrons.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getIntrons.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,89 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress


class GetIntrons(object):
    """Write the introns of each input transcript as stand-alone transcripts in a GFF3 file."""

    def __init__(self, verbosity):
        """Keep the trace level used by the parser, the writer and the progress bar."""
        self.verbosity = verbosity

    def setInputFile(self, fileName, format):
        """Select the parser that matches 'format' and open 'fileName' with it."""
        formatChooser = ParserChooser(self.verbosity)
        formatChooser.findFormat(format)
        self.parser = formatChooser.getParser(fileName)

    def setOutputFile(self, fileName):
        """Open the GFF3 writer on 'fileName'."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def run(self):
        """Go through the transcripts and write each of their introns."""
        nbTranscripts = self.parser.getNbTranscripts()
        progress = Progress(nbTranscripts, "Reading input file", self.verbosity)
        nbIntrons = 0
        # Both counters are one-based, as they only serve to build labels.
        for transcriptRank, transcript in enumerate(self.parser.getIterator(), 1):
            for intronRank, intron in enumerate(transcript.getIntrons(), 1):
                defaultLabel = "intron_%d-%d" % (transcriptRank, intronRank)
                intronTranscript = Transcript()
                intronTranscript.copy(intron)
                tags = intronTranscript.tags
                # The intron is written as a top-level feature.
                if "Parent" in tags:
                    del tags["Parent"]
                tags["feature"] = "transcript"
                # Give a label to the introns that have no usable ID or name.
                if not ("ID" in tags and tags["ID"] != "unnamed transcript"):
                    tags["ID"] = defaultLabel
                if intronTranscript.getName() == "unnamed transcript":
                    intronTranscript.setName(defaultLabel)
                self.writer.addTranscript(intronTranscript)
                nbIntrons += 1
            progress.inc()
        self.writer.write()
        self.writer.close()
        progress.done()
        if self.verbosity > 1:
            print("%d transcripts read" % (self.parser.getNbTranscripts()))
            print("%d introns written" % (nbIntrons))


if __name__ == "__main__":

    description = "Get Introns v1.0.1: Get the introns of a set of transcripts. [Category: Data Modification]"

    # command line
    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    intronGetter = GetIntrons(options.verbosity)
    intronGetter.setInputFile(options.inputFileName, options.format)
    intronGetter.setOutputFile(options.outputFileName)
    intronGetter.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getLetterDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getLetterDistribution.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,153 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""Compute the letter distribution of a set of sequences, globally and per position"""

import os
from optparse import OptionParser
from commons.core.parsing.FastaParser import *
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.misc.RPlotter import *
from commons.core.parsing.ParserChooser import ParserChooser


def writeCVSfile(outHandler, rates = None):
    """Write the per-position letter percentages, one line per position.

    outHandler: object with a write() method receiving the text.
    rates:      dictionary {letter: {one-based position: percentage}};
                when omitted, the module-level 'positionRate' built by the
                main script is used.

    Each line reads '<position>;<letter>=<percentage>%;...'; a letter never
    seen at a position is written as '<letter>=0%'.  Positions run from 1 to
    the largest position present in 'rates', and letters are written in
    sorted order.
    """
    if rates is None:
        rates = positionRate
    letterNames = sorted(rates)
    # The number of lines follows the longest sequence seen, not the last
    # one that was read.
    lastPosition = 0
    for letter in letterNames:
        if rates[letter]:
            lastPosition = max(lastPosition, max(rates[letter]))
    for position in range(1, lastPosition + 1):
        outHandler.write("%s;" % (position))
        for letter in letterNames:
            # The keys of 'rates' are one-based, like the printed position.
            if position in rates[letter]:
                outHandler.write("%s=%.2f%s;" % (letter, rates[letter][position], "%"))
            else:
                outHandler.write("%s=0%s;" % (letter, "%"))
        outHandler.write("\n")


if __name__ == "__main__":

    # parse command line
    description = "Get Letter Distribution v1.0.1: Compute the distribution of nucleotides of a set of genomic coordinates. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file to be analyzed [compulsory] [format: file in sequence format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: sequence file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    parser.add_option("-c", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool] [default: false]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    chooser = ParserChooser()
    chooser.findFormat(options.format)
    # From here on 'parser' is the sequence parser, not the option parser.
    parser = chooser.getParser(options.inputFileName)
    nbSequences = parser.getNbSequences()
    print("%i sequences read" % (nbSequences))

    # treat items
    progress = Progress(nbSequences, "Analyzing sequences of " + options.inputFileName, options.verbosity)
    nbLettersTotal = 0
    nbLetters = {}        # letter -> number of occurrences in all the sequences
    lettersRate = {}      # letter -> {percentage in a sequence -> number of sequences}
    nbPositions = {}      # one-based position -> number of sequences reaching it
    positionCount = {}    # letter -> {one-based position -> number of occurrences}
    positionRate = {}     # letter -> {one-based position -> percentage}
    nbPositionRate = {}   # one-based position -> percentage of sequences reaching it
    for sequence in parser.getIterator():
        letters = sequence.getSequence()
        thisNbLettersTotal = sequence.getSize()
        nbLettersTotal += thisNbLettersTotal
        thisNbLetters = {}

        for position, letter in enumerate(letters, 1):
            thisNbLetters[letter] = thisNbLetters.get(letter, 0) + 1
            nbPositions[position] = nbPositions.get(position, 0) + 1
            counts = positionCount.setdefault(letter, {})
            counts[position] = counts.get(position, 0) + 1

        for letter in thisNbLetters:
            nbLetters[letter] = nbLetters.get(letter, 0) + thisNbLetters[letter]
            # Percentage of this letter in the current sequence, truncated to
            # an integer so that it can be used as a histogram bin.
            rate = int(float(thisNbLetters[letter]) / thisNbLettersTotal * 100)
            rates = lettersRate.setdefault(letter, {})
            rates[rate] = rates.get(rate, 0) + 1
        progress.inc()
    progress.done()

    for letter in positionCount:
        positionRate[letter] = {}
        for pos in positionCount[letter]:
            positionRate[letter][pos] = positionCount[letter][pos] / float(nbPositions[pos]) * 100
    for pos in nbPositions:
        # Relative to the number of sequences that have a first letter.
        nbPositionRate[pos] = nbPositions[pos] / float(nbPositions[1]) * 100

    # plot content distributions
    plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity, True)
    plotter.setFill(0)
    plotter.setLegend(True)
    for letter in lettersRate:
        plotter.addLine(lettersRate[letter], letter)
    plotter.plot()

    # plot distribution per position
    plotter = RPlotter("%sPerNt.png" % (options.outputFileName), options.verbosity, True)
    plotter.setFill(0)
    plotter.setLegend(True)
    plotter.setXLabel("Position on the read")
    plotter.setYLabel("Percentage")
    for letter in positionRate:
        plotter.addLine(positionRate[letter], letter)
    plotter.addLine(nbPositionRate, "#")
    plotter.plot()

    if options.csv:
        # 'with' closes the file even if writing fails.
        with open("%s.csv" % (options.outputFileName), "w") as outHandler:
            writeCVSfile(outHandler, positionRate)

    print("%d sequences" % (nbSequences))
    print("%d letters" % (nbLettersTotal))
    for letter in nbLetters:
        print("%s: %d (%.2f%%)" % (letter, nbLetters[letter], float(nbLetters[letter]) / nbLettersTotal * 100))
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getReadDistribution.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getReadDistribution.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Count how many times each read sequence occurs in a FASTA/FASTQ file.

Two files are produced from the output prefix given with -o:
  <prefix>.txt  one "sequence<TAB>count" line per reported read, most frequent first
  <prefix>.png  plot of the number of distinct reads (y) per occurrence count (x)
"""
import os
from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils


def _getThreshold(sortedCounts, percent, number):
    """
    Return the minimal occurrence count a read must reach to be written.

    sortedCounts: occurrence counts of the distinct reads, in increasing order
    percent:      value of -p, or None when the option was not given
    number:       value of -n, or None when the option was not given

    Returns 0 (every read is kept) when neither option is given or when there
    is no read at all.  Indexes are clamped to the list so that "-p 100" or a
    "-n" larger than the number of distinct reads no longer raise IndexError.
    """
    if not sortedCounts:
        return 0
    nbCounts = len(sortedCounts)
    if percent != None:
        # NOTE(review): the list is in increasing order, so this picks the
        # value found at <percent>% from the *bottom*; confirm that this is
        # what "keep the best n%" is meant to do.
        index = int(float(percent) / 100 * nbCounts)
        return sortedCounts[max(0, min(index, nbCounts - 1))]
    if number != None:
        return sortedCounts[-min(number, nbCounts)]
    return 0


if __name__ == "__main__":

    # parse command line
    description = "Get Read Distribution v1.0.1: Plot the number of identical reads and give the most represented. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file sequence [compulsory] [format: file in sequence format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: sequence file format]")
    parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="keep the best n [format: int]")
    # "\\%" is the same runtime string as the former "\%", without relying on an invalid escape sequence
    parser.add_option("-p", "--percent", dest="percent", action="store", default=None, type="float", help="keep the best n\\% [format: float]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output files in PNG format and txt format]")
    parser.add_option("-x", "--xMax", dest="xMax", action="store", default=None, type="int", help="maximum value on the x-axis to plot [format: int]")
    parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # fail with a usage message rather than with a TypeError further down
    if options.inputFileName is None or options.outputFileName is None:
        parser.error("options -i and -o are compulsory")

    # prefix the output name with the working directory; the directory used
    # to be ignored when it already ended with '/', and an empty directory
    # crashed on the [-1] look-up
    workingDir = options.working_Dir
    if workingDir and not workingDir.endswith('/'):
        workingDir += '/'
    options.outputFileName = workingDir + options.outputFileName

    if options.format == "fasta":
        sequenceParser = FastaParser(options.inputFileName, options.verbosity)
    elif options.format == "fastq":
        sequenceParser = FastqParser(options.inputFileName, options.verbosity)
    else:
        raise Exception("Do not understand '%s' file format." % (options.format))

    # count the occurrences of each distinct read
    progress = Progress(sequenceParser.getNbSequences(), "Reading %s" % (options.inputFileName), options.verbosity)
    sequences = {}
    for sequence in sequenceParser.getIterator():
        read = sequence.sequence
        sequences[read] = sequences.get(read, 0) + 1
        progress.inc()
    progress.done()

    values = sorted(sequences.values())
    threshold = _getThreshold(values, options.percent, options.number)

    # group the reads by number of occurrences; the loop runs once per
    # distinct read, so the progress bar is sized accordingly
    progress = Progress(len(sequences), "Sorting values", options.verbosity)
    sortedValues = {}
    for read, value in sequences.items():
        sortedValues.setdefault(value, []).append(read)
        progress.inc()
    progress.done()

    # write the selected reads, most frequent first
    outputFileName = "%s.txt" % (options.outputFileName)
    progress = Progress(len(sequences), "Writing into %s" % (outputFileName), options.verbosity)
    with open(outputFileName, "w") as handle:
        for value in sorted(sortedValues.keys(), reverse=True):
            if value >= threshold:
                for read in sortedValues[value]:
                    handle.write("%s\t%d\n" % (read, value))
                    progress.inc()
    progress.done()

    # histogram: occurrence count -> number of distinct reads having that count
    line = {}
    progress = Progress(len(values), "Preparing plot", options.verbosity)
    for value in values:
        line[value] = line.get(value, 0) + 1
        progress.inc()
    progress.done()

    plot = RPlotter("%s.png" % (options.outputFileName), options.verbosity)
    plot.setFill(0)
    plot.setMaximumX(options.xMax)
    plot.setXLabel("# occurrences")
    plot.setYLabel("# reads")
    plot.addLine(line)
    plot.plot()

    if options.verbosity > 0:
        print("%d/%.2f/%.1f/%d occurrences" % (Utils.getMinAvgMedMax(line)))
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getSizes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getSizes.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,218 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import os, sys\n+from optparse import OptionParser\n+from commons.core.parsing.FastaParser import FastaParser\n+from commons.core.parsing.FastqParser import FastqParser\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from commons.core.parsing.GffParser import GffParser\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from SMART.Java.Python.misc import Utils\n+\n+from commons.core.LoggerFactory import LoggerFactory\n+from commons.core.utils.RepetOptionParser import RepetOptionParser\n+\n+LOG_DEPTH = "smart"\n+\n+class GetSizes(object):\n+\t\n+\tdef __init__(self, inFileName = None, inFormat=None, outFileName = None, query=None,xMax=None, xMin=None, verbosity = 0):\n+\t\tself.inFileName = inFileName\n+\t\tself.inFormat= inFormat\n+\t\tself.outFileName = outFileName\n+\t\tself.query = query\n+\t\tself.xMax = xMax\n+\t\tself.xMin = xMin\n+\t\tself.xLab = "Size"\n+\t\tself.yLab = "# reads"\n+\t\tself.barplot = False\n+\t\tself._verbosity = verbosity\n+\t\tself.parser = None\n+\t\tself._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)\n+\t\t\n+\tdef setAttributesFromCmdLine(self):\n+\t\tdescription = "Usage: getSizes.py [options]\\n\\nGet Sizes v1.0.2: Get the sizes of a set of genomic coordinates. 
[Category: Visualization]\\n"\n+\t\tepilog = ""\n+\t\tparser = RepetOptionParser(description = description, epilog = epilog)\n+\t\tparser.add_option("-i", "--input",\t dest="inputFileName", action="store",\t default=None,\t type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")\n+\t\tparser.add_option("-f", "--format",\tdest="format",\t\t action="store",\t default=None,\t type="string", help="format of the input [compulsory] [format: transcript or sequence file format]")\n+\t\tparser.add_option("-q", "--query",\t dest="query",\t\t action="store",\t default=None,\t type="string", help="type to mesure [default: size] [format: choice (size, intron size, exon size, 1st exon size)]")\t \n+\t\tparser.add_option("-o", "--output",\tdest="outputFileName", action="store",\t default=None,\t type="string", help="output file [format: output file in PNG format]")\n+\t\tparser.add_option("-x", "--xMax",\t dest="xMax",\t\t action="store",\t default=None,\t type="int",\t help="maximum value on the x-axis to plot [format: int]")\n+\t\tparser.add_option("-X", "--xMin",\t dest="xMin",\t\t action="store",\t default=None,\t '..b'="store_true", default=False,\t\t\t\t\t help="use barplot representation [format: bool] [default: false]") \n+\t\toptions = parser.parse_args()[0]\n+\t\tself._setAttributesFromOptions(options)\n+\t\t\n+\tdef _setAttributesFromOptions(self, options):\n+\t\tself.setInFileName(options.inputFileName)\n+\t\tself.setInFormat(options.format)\n+\t\tself.setQuery(options.query)\n+\t\tself.setOutFileName(options.outputFileName)\n+\t\tself.setXMax(options.xMax)\n+\t\tself.setXMin(options.xMin)\n+\t\tself.setxLab(options.xLab)\n+\t\tself.setyLab(options.yLab)\n+\t\tself.setBarplot(options.barplot)\n+\t\tself.setVerbosity(options.verbosity)\n+\t\t\n+\tdef setInFileName(self, inputFileName):\n+\t\tself.inFileName = inputFileName\n+\t\t\n+\tdef setInFormat(self, inFormat):\n+\t\tself.inFormat = inFormat\n+\t\n+\tdef 
setQuery(self, query):\n+\t\tself.query = query\n+\t\t\n+\tdef setOutFileName(self, outFileName):\n+\t\tself.outFileName = outFileName\n+\t\n+\tdef setXMax(self, xMax):\n+\t\tself.xMax = xMax\n+\t\t\n+\tdef setXMin(self, xMin):\n+\t\tself.xMin = xMin\n+\t\n+\tdef setxLab(self, xLab):\n+\t\tself.xLab = xLab\n+\t\t\n+\tdef setyLab(self, yLab):\n+\t\tself.yLab = yLab\n+\t\t\n+\tdef setBarplot(self, barplot):\n+\t\tself.barplot = barplot\n+\t\t\n+\tdef setVerbosity(self, verbosity):\n+\t\tself._verbosity = verbosity\n+\t\t\n+\tdef _checkOptions(self):\n+\t\tif self.inFileName == None:\n+\t\t\tself._logAndRaise("ERROR: Missing input file name")\n+\t\tif self.inFormat == "fasta":\n+\t\t\tself.parser = FastaParser(self.inFileName, self._verbosity)\n+\t\telif self.inFormat == "fastq":\n+\t\t\tself.parser = FastqParser(self.inFileName, self._verbosity)\n+\t\telse:\n+\t\t\tself.parser = TranscriptContainer(self.inFileName, self.inFormat, self._verbosity)\n+\t\t\t\n+\tdef _logAndRaise(self, errorMsg):\n+\t\tself._log.error(errorMsg)\n+\t\traise Exception(errorMsg)\n+\n+\tdef run(self):\n+\t\tLoggerFactory.setLevel(self._log, self._verbosity)\n+\t\tself._checkOptions()\n+\t\tself._log.info("START getsizes")\n+\t\tself._log.debug("Input file name: %s" % self.inFileName)\n+\n+\t\tnbItems = self.parser.getNbItems()\n+\t\tself._log.info( "%i items found" % (nbItems))\n+\t\t\n+\t\t# treat items\n+\t\tprogress = Progress(nbItems, "Analyzing sequences of %s" % (self.inFileName), self._verbosity)\n+\t\tsizes = {}\n+\t\tminimum\t = 1000000000000\n+\t\tmaximum\t = 0\n+\t\tsum\t\t = 0\n+\t\tnumber = 0\n+\t\tnbSubItems = 0\n+\t\tfor item in self.parser.getIterator():\n+\t\t\titems = []\n+\t\t\tif self.query == "exon":\n+\t\t\t\titems = item.getExons()\n+\t\t\telif self.query == "exon1":\n+\t\t\t\tif len(item.getExons()) > 1:\n+\t\t\t\t\titem.sortExons()\n+\t\t\t\t\titems = [item.getExons()[0]]\n+\t\t\telif self.query == "intron":\n+\t\t\t\titems = 
item.getIntrons()\n+\t\t\telse:\n+\t\t\t\titems = [item, ]\n+\t\n+\t\t\tfor thisItem in items:\n+\t\t\t\ttry:\n+\t\t\t\t\tnbElements = int(float(thisItem.getTagValue("nbElements")))\n+\t\t\t\t\tif nbElements == None:\n+\t\t\t\t\t\tnbElements = 1\n+\t\t\t\texcept:\n+\t\t\t\t\tnbElements = 1\n+\t\t\t\tsize\t= thisItem.getSize()\n+\t\t\t\tminimum = min(minimum, size)\n+\t\t\t\tmaximum = max(maximum, size)\n+\t\t\t\t\n+\t\t\t\tif size not in sizes:\n+\t\t\t\t\tsizes[size] = nbElements\n+\t\t\t\telse:\n+\t\t\t\t\tsizes[size] += nbElements\n+\t\t\t\tsum\t\t+= size\n+\t\t\t\tnbSubItems += nbElements\n+\t\t\tnumber += 1\n+\t\t\tprogress.inc()\n+\t\tprogress.done()\n+\n+\t\tif self.outFileName != None:\n+\t\t\tplotter = RPlotter(self.outFileName, self._verbosity)\n+\t\t\tplotter.setFill(0)\n+\t\t\tplotter.setMinimumX(self.xMin)\n+\t\t\tplotter.setMaximumX(self.xMax)\n+\t\t\tplotter.setXLabel(self.xLab)\n+\t\t\tplotter.setYLabel(self.yLab)\n+\t\t\tplotter.setBarplot(self.barplot)\n+\t\t\tplotter.addLine(sizes)\n+\t\t\tplotter.plot()\n+\t\t\t\n+\t\tif nbSubItems == 0:\n+\t\t\tself._logAndRaise("No item found")\n+\t\t\t\n+\t\tself.items = number\t \n+\t\tself.subItems = nbSubItems\n+\t\tself.nucleotides = sum\n+\t\tself.minAvgMedMax = Utils.getMinAvgMedMax(sizes)\n+\t\t\t\t \n+\t\tprint "%d items" % (number)\n+\t\tprint "%d sub-items" % (nbSubItems)\n+\t\tprint "%d nucleotides" % (sum)\n+\t\tprint "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(sizes)\n+\n+\t\tself._log.info("END getsizes")\n+\n+\n+if __name__ == "__main__":\n+\tiGetSizes = GetSizes()\n+\tiGetSizes.setAttributesFromCmdLine()\n+\tiGetSizes.run()\n+\t\n+#TODO: add two more options!!!!!!\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getWigData.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getWigData.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
For each transcript of the input file, average the WIG values extracted over
the transcript, store that average in the tag given by -t and write the
transcript to a GFF3 file.
"""
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.parsing.WigParser import WigParser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.misc.Progress import Progress


if __name__ == "__main__":

    # parse command line
    description = "Get WIG Data v1.0.1: Compute the average data for some genomic coordinates using WIG files (thus covering a large proportion of the genome) and update a tag. [Category: WIG Tools]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-w", "--wig", dest="wig", action="store", type="string", help="wig file name [compulsory] [format: file in WIG format]")
    # NOTE(review): the "[format: file in WIG format]" tag below looks copy-pasted from -w; left untouched because these tags may be parsed by other tools
    parser.add_option("-t", "--tag", dest="tag", action="store", type="string", help="choose a tag name to write the wig information to output file [compulsory] [format: file in WIG format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider both strands separately [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # create parsers and writers
    transcriptParser = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity)
    wigParser = WigParser(options.wig)
    writer = Gff3Writer(options.outputFileName, options.verbosity)
    wigParser.setStrands(options.strands)

    progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
    for transcript in transcriptParser.getIterator():
        values = transcript.extractWigData(wigParser)
        if options.strands:
            # in stranded mode the extracted data is indexed by strand:
            # keep the values of the strand of the transcript
            values = values[transcript.getDirection()]
        # an average is undefined when no value was extracted: the transcript
        # is then written without the tag instead of aborting the whole run
        # with a ZeroDivisionError
        if len(values) > 0:
            transcript.setTagValue(options.tag, str(float(sum(values)) / len(values)))
        writer.addTranscript(transcript)
        progress.inc()
    progress.done()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getWigDistance.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getWigDistance.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Plot the average WIG value found at each offset (from -distance to +distance)
around the start of the transcripts of the input file.
"""

from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.parsing.WigParser import WigParser
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.RPlotter import RPlotter


if __name__ == "__main__":

    # parse command line
    # NOTE(review): the description below says "Get WIG Data" although this is getWigDistance.py; left untouched because the string may be parsed by other tools
    description = "Get WIG Data v1.0.2: Compute the average data around some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat", dest="inputFormat", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
    parser.add_option("-w", "--wig", dest="wig", action="store", type="string", help="wig file name [compulsory] [format: file in WIG format]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=1000, type="int", help="distance around position [compulsory] [format: int] [default: 1000]")
    parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider both strands separately [format: boolean] [default: False]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-a", "--default", dest="defaultValue", action="store", default=0.0, type="float", help="default value (when value is NA) [default: 0.0] [format: float]")
    parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="use log scale for y-axis [format: boolean] [default: False]")
    parser.add_option("-k", "--keep", dest="keep", action="store_true", default=False, help="keep temporary files [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # create parsers and writers
    transcriptParser = TranscriptContainer(options.inputFileName, options.inputFormat, options.verbosity)
    wigParser = WigParser(options.wig)
    wigParser.setStrands(options.strands)
    wigParser.setDefaultValue(options.defaultValue)

    # allocate data: one accumulator per strand, indexed by offset
    strands = (1, -1) if options.strands else (1, )
    offsets = range(-options.distance, options.distance+1)
    expectedSize = 2 * options.distance + 1
    values = {}
    for strand in strands:
        values[strand] = dict([(i, 0.0) for i in offsets])

    # read transcripts
    progress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
    for transcript in transcriptParser.getIterator():
        # reshape the transcript into a window of expectedSize positions
        transcript.removeExons()
        transcript.restrictStart(2)
        transcript.extendStart(options.distance)
        transcript.extendEnd(options.distance-1)
        theseValues = transcript.extractWigData(wigParser)
        if len(strands) == 1:
            # unstranded data comes as a plain list: index it like stranded data
            theseValues = {1: theseValues}
        for strand in strands:
            nbMissing = expectedSize - len(theseValues[strand])
            if nbMissing > 0:
                # pad short windows on the left with the default value
                theseValues[strand] = [options.defaultValue] * nbMissing + theseValues[strand]
            if len(theseValues[strand]) != expectedSize:
                raise Exception("Got something wrong with the size of the WIG data concerning %s: %d found instead of %d" % (transcript, len(theseValues[strand]), expectedSize))
            for i in offsets:
                values[strand][i] += theseValues[strand][i + options.distance]
        progress.inc()
    progress.done()

    # turn the sums into averages; multiplying by the strand (1 or -1) makes
    # the minus-strand curve negative.  Nothing to normalise when the input
    # holds no transcript (this used to raise ZeroDivisionError).
    nbTranscripts = transcriptParser.getNbTranscripts()
    if nbTranscripts > 0:
        for strand in strands:
            divisor = nbTranscripts * strand
            for i in offsets:
                values[strand][i] /= divisor

    # draw plot
    plotter = RPlotter(options.outputFileName, options.verbosity, options.keep)
    plotter.setXLabel("Distance")
    plotter.setYLabel("WigValue")
    for strand in strands:
        plotter.addLine(values[strand])
    if options.log:
        plotter.setLog("y")
    plotter.plot()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/getWigProfile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/getWigProfile.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,160 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""\n+Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks.\n+"""\n+\n+import math\n+from optparse import OptionParser\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from commons.core.parsing.WigParser import WigParser\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+\n+class GetWigProfile(object):\n+\n+\tdef __init__(self, verbosity):\n+\t\tself.verbosity\t= verbosity\n+\t\tself.values\t\t = {}\n+\t\tself.defaultValue = 0.0\n+\n+\tdef _iToJ(self, i, size):\n+\t\treturn min(self.nbPoints+1, int(math.floor(float(i - self.distance) / (size) * (self.nbPoints))))\n+\n+\tdef readTranscripts(self):\n+\t\tself.strandNames = (1, -1) if self.strands else (1, )\n+\t\tself.values\t\t= dict([(strand, dict([(i, 0.0) for i in range(self.nbPoints + 2 * self.distance)])) for strand in self.strandNames])\n+\t\ttranscriptParser = TranscriptContainer(self.inputFileName, self.inputFormat, self.verbosity)\n+\t\twigParser\t\t= WigParser(self.wig)\n+\t\tnbValues\t\t = dict([(strand, dict([(i, 0.0) for i in range(self.nbPoints + 2 * self.distance)])) for strand in self.strandNames])\n+\t\twigParser.setStrands(self.strands)\n+\t\twigParser.setDefaultValue(self.defaultValue)\n+\n+\t\tprogress = Progress(transcriptParser.getNbTranscripts(), "Parsing %s" % (self.inputFileName), self.verbosity)\n+\t\tfor transcript in transcriptParser.getIterator():\n+\t\t\ttranscriptSize = 
transcript.getSize()\n+\t\t\texpectedSize = transcriptSize + 2 * self.distance\n+\t\t\ttranscript.extendStart(self.distance)\n+\t\t\ttranscript.extendEnd(self.distance)\n+\t\t\ttheseValues = transcript.extractWigData(wigParser)\n+\n+\t\t\tif len(self.strandNames) == 1:\n+\t\t\t\ttheseValues = {1: theseValues}\n+\t\t\tfor strand in self.strandNames:\n+\t\t\t\tif len(theseValues[strand]) < expectedSize:\n+\t\t\t\t\ttheseValues[strand] = [self.defaultValue] * (expectedSize - len(theseValues[strand])) + theseValues[strand]\n+\t\t\t\tif len(theseValues[strand]) != expectedSize:\n+\t\t\t\t\traise Exception("Got something wrong with the size of the WIG data concerning %s [%s]: %d found instead of %d" % (transcript, ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in transcript.getExons()]), len(theseValues[strand]), expectedSize))\n+\t\t\t\tfivePValues = theseValues[strand][: self.distance]\n+\t\t\t\tnbValues = [0.0] * (self.nbPoints)\n+\t\t\t\ttranscriptValues = [0.0] * (self.nbPoints)\n+\t\t\t\tfor i in range(self.distance, len(theseValues[stra'..b'+\t\t\t\tstrand = 1\n+\t\t\tfor i in range(self.nbPoints + 2 * self.distance):\n+\t\t\t\tself.values[strand][i] /= transcriptParser.getNbTranscripts() * strand\n+\n+\n+\tdef smoothen(self):\n+\t\tif self.smoothenForce == None:\n+\t\t\treturn\n+\t\tfor strand in self.strandNames:\n+\t\t\taverageValues = {}\n+\t\t\tfor center in range(self.distance, self.distance + self.nbPoints):\n+\t\t\t\tsum\t\t= 0.0\n+\t\t\t\tnbValues = 0.0\n+\t\t\t\tfor i in range(center - self.smoothenForce + 1, center + self.smoothenForce):\n+\t\t\t\t\tif i > self.distance and i < self.distance + self.nbPoints:\n+\t\t\t\t\t\tnbValues += 1\n+\t\t\t\t\t\tsum\t\t+= self.values[strand][i]\n+\t\t\t\taverageValues[center] = sum / nbValues\n+\t\t\tfor position in range(self.distance, self.distance + self.nbPoints):\n+\t\t\t\tself.values[strand][position] = averageValues[position]\n+\t\t\n+\n+\tdef plot(self):\n+\t\tplotter = 
RPlotter(self.outputFileName, self.verbosity)\n+\t\tfor strand in self.strandNames:\n+\t\t\tplotter.addLine(self.values[strand])\n+\t\tif self.log:\n+\t\t\tplotter.setLog("y")\n+\t\tplotter.setAxisLabel("x", {0: -self.distance, self.distance: "start", self.distance+self.nbPoints-1: "end", 2*self.distance+self.nbPoints-1: self.distance})\n+\t\tplotter.plot()\n+\n+\n+\n+if __name__ == "__main__":\n+\t\n+\t# parse command line\n+\tdescription = "Get WIG Profile v1.0.1: Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome). [Category: WIG Tools]"\n+\n+\tparser = OptionParser(description = description)\n+\tparser.add_option("-i", "--input",\t\t\t dest="inputFileName",\taction="store",\t\t\t\t\t\t\t\t\t\t\ttype="string", help="input file [compulsory] [format: file in transcript format given by -f]")\n+\tparser.add_option("-f", "--inputFormat", dest="inputFormat",\t\taction="store",\t\t\t\t\t\t\t\t\t\t\ttype="string", help="format of the input file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-w", "--wig",\t\t\t\t dest="wig",\t\t\t\t\t\taction="store",\t\t\t\t\t\t\t\t\t\t\ttype="string", help="wig file name [compulsory] [format: file in WIG format]")\t\n+\tparser.add_option("-p", "--nbPoints",\t\t dest="nbPoints",\t\t\t\taction="store",\t\t\t default=1000,\ttype="int",\t\t help="number of points on the x-axis [compulsory] [format: int] [default: 1000]")\t\n+\tparser.add_option("-d", "--distance",\t\t dest="distance",\t\t\t\taction="store",\t\t\t default=0,\t\t\ttype="int",\t\t help="distance around genomic coordinates [compulsory] [format: int] [default: 0]")\t\n+\tparser.add_option("-s", "--strands",\t\t dest="strands",\t\t\t\taction="store_true", default=False,\t\t\t\t\t\t\t\t help="consider both strands separately [format: boolean] [default: False]")\t\n+\tparser.add_option("-m", "--smoothen",\t\t dest="smoothen",\t\t\t\taction="store",\t\t\t default=None,\ttype="int",\t\t 
help="smoothen the curve [format: int] [default: None]")\t\n+\tparser.add_option("-a", "--default",\t\t dest="defaultValue",\t action="store",\t\t\t default=0.0,\t type="float",\thelp="default value (when value is NA) [default: 0.0] [format: float]")\n+\tparser.add_option("-o", "--output",\t\t\t dest="outputFileName", action="store",\t\t\t\t\t\t\t\t\t\t\ttype="string", help="output file [compulsory] [format: output file in PNG format]")\n+\tparser.add_option("-l", "--log",\t\t\t\t dest="log",\t\t\t\t\t\taction="store_true", default=False,\t\t\t\t\t\t\t\t help="use log scale for y-axis\t[format: boolean] [default: False]")\n+\tparser.add_option("-v", "--verbosity",\t dest="verbosity",\t\t\taction="store",\t\t\t default=1,\t\t\ttype="int",\t\t help="trace level [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\twigProfile\t\t\t\t\t\t\t\t= GetWigProfile(options.verbosity)\n+\twigProfile.strands\t\t\t \t= options.strands\n+\twigProfile.inputFileName\t= options.inputFileName\n+\twigProfile.inputFormat\t\t= options.inputFormat\n+\twigProfile.wig\t\t\t\t\t\t= options.wig\n+\twigProfile.nbPoints\t\t\t\t= options.nbPoints\n+\twigProfile.distance\t\t\t\t= options.distance\n+\twigProfile.smoothenForce\t= options.smoothen\n+\twigProfile.defaultValue\t = options.defaultValue\n+\twigProfile.outputFileName = options.outputFileName\n+\twigProfile.log\t\t\t\t\t\t= options.log\n+\n+\twigProfile.readTranscripts()\n+\twigProfile.smoothen()\n+\twigProfile.plot()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mapperAnalyzer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mapperAnalyzer.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,486 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""\n+Read a mapping file (many formats supported) and select some of them\n+Mappings should be sorted by read names\n+"""\n+import os, random, shelve\n+from optparse import OptionParser, OptionGroup\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.parsing.FastaParser import FastaParser\n+from commons.core.parsing.FastqParser import FastqParser\n+from commons.core.parsing.GffParser import GffParser\n+from commons.core.writer.BedWriter import BedWriter\n+from commons.core.writer.UcscWriter import UcscWriter\n+from commons.core.writer.GbWriter import GbWriter\n+from commons.core.writer.Gff2Writer import Gff2Writer\n+from commons.core.writer.Gff3Writer import Gff3Writer\n+from commons.core.writer.FastaWriter import FastaWriter\n+from commons.core.writer.FastqWriter import FastqWriter\n+from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter\n+from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection\n+from SMART.Java.Python.mySql.MySqlTable import MySqlTable\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+\n+\n+distanceExons = 20\n+exonSize = 20\n+\n+\n+class MapperAnalyzer(object):\n+ """\n+ Analyse the output of a parser\n+ """\n+\n+ def __init__(self, verbosity = 0):\n+ self.verbosity = verbosity\n+ self.mySqlConnection = MySqlConnection(verbosity)\n+ self.tooShort = 0\n+ self.tooManyMismatches = 0\n+ self.tooManyGaps = 0\n+ 
self.tooShortExons = 0\n+ self.tooManyMappings = 0\n+ self.nbMappings = 0\n+ self.nbSequences = 0\n+ self.nbAlreadyMapped = 0\n+ self.nbAlreadyMappedSequences = 0\n+ self.nbWrittenMappings = 0\n+ self.nbWrittenSequences = 0\n+ self.parser = None\n+ self.logHandle = None\n+ self.randomNumber = random.randint(0, 100000)\n+ self.gff3Writer = None\n+ self.alreadyMappedReader = None\n+ self.unmatchedWriter = None\n+ self.sequenceListParser = None\n+ self.sequences = None\n+ self.alreadyMapped = None\n+ self.mappedNamesTable = None\n+ s'..b'up.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")\n+ otheGroup.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")\n+ parser.add_option_group(compGroup)\n+ parser.add_option_group(filtGroup)\n+ parser.add_option_group(tranGroup)\n+ parser.add_option_group(outpGroup)\n+ parser.add_option_group(otheGroup)\n+ (options, args) = parser.parse_args()\n+\n+ \n+ analyzer = MapperAnalyzer(options.verbosity)\n+ analyzer.setMappingFile(options.inputFileName, options.format)\n+ analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat)\n+ analyzer.setOutputFile(options.outputFileName, options.title)\n+ if options.appendFileName != None:\n+ analyzer.setAlreadyMatched(options.appendFileName)\n+ if options.remaining:\n+ analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat)\n+ if options.number != None:\n+ analyzer.setMaxMappings(options.number)\n+ if options.size != None:\n+ analyzer.setMinSize(options.size)\n+ if options.identity != None:\n+ analyzer.setMinId(options.identity)\n+ if options.mismatch != None:\n+ analyzer.setMaxMismatches(options.mismatch)\n+ if options.gap != None:\n+ analyzer.setMaxGaps(options.gap)\n+ if options.mergeExons:\n+ analyzer.mergeExons(True)\n+ if options.removeExons:\n+ analyzer.acceptShortExons(False)\n+ if 
options.log:\n+ analyzer.setLog("%s.log" % (options.outputFileName))\n+ analyzer.analyze()\n+ \n+ if options.verbosity > 0:\n+ print "kept %i sequences over %s (%f%%)" % (analyzer.nbWrittenSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)\n+ if options.appendFileName != None:\n+ print "kept %i sequences over %s (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)\n+ print "kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, analyzer.nbMappings, float(analyzer.nbWrittenMappings) / analyzer.nbMappings * 100)\n+ if options.appendFileName != None:\n+ print "kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, analyzer.nbMappings, float(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped) / analyzer.nbMappings * 100)\n+ print "removed %i too short mappings (%f%%)" % (analyzer.tooShort, float(analyzer.tooShort) / analyzer.nbMappings * 100)\n+ print "removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, float(analyzer.tooManyMismatches) / analyzer.nbMappings * 100)\n+ print "removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, float(analyzer.tooManyGaps) / analyzer.nbMappings * 100)\n+ print "removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, float(analyzer.tooShortExons) / analyzer.nbMappings * 100)\n+ print "removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, float(analyzer.tooManyMappings) / analyzer.nbSequences * 100)\n+ print "%i sequences have no mapping (%f%%)" % (analyzer.nbSequences - analyzer.nbWrittenSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)\n+ if options.appendFileName != None:\n+ print "%i sequences have 
no mapping (%f%%) excluding already mapped sequences" % (analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)\n+\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mappingToCoordinates.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mappingToCoordinates.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#


"""Convert files with some mapping format to coordinates format"""

import os
from optparse import OptionParser
from commons.core.parsing.PslParser import PslParser
from commons.core.parsing.AxtParser import AxtParser
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress


class MappingToCoordinates(object):
    """
    Read a mapping file through a TranscriptContainer and write every
    transcript it yields with a Gff3Writer.
    """

    def __init__(self, verbosity=1, inputFileName=None, format=None, output=None, galaxy=False, title="S-MART"):
        """
        @param verbosity:     trace level (messages are printed when > 0)
        @param inputFileName: name of the mapping file to read
        @param format:        format of the mapping file
        @param output:        name given to the GFF3 writer
        @param galaxy:        if true, rename "<output>.gff3" to "<output>" at the end
        @param title:         title handed to the GFF3 writer
        """
        self.verbosity     = verbosity
        self.inputFileName = inputFileName
        self.format        = format
        self.output        = output
        self.galaxy        = galaxy
        self.title         = title

    def setAttributesFromCmdLine(self):
        """
        Fill the attributes from the command line (the title keeps its current value).
        Exits with a usage error when one of the compulsory options is missing.
        """
        description = "Mapping To Coordinates v1.0.1: Convert a set of mappings (given by a mapping tool) to a set of transcripts. [Category: Conversion]"
        parser = OptionParser(description = description)
        parser.add_option("-i", "--input",     dest="inputFileName", action="store",                     type="string", help="input file [compulsory] [format: file in mapping format given by -f]")
        parser.add_option("-f", "--format",    dest="format",        action="store",                     type="string", help="format of file [compulsory] [format: mapping file format]")
        parser.add_option("-o", "--output",    dest="output",        action="store",      default=None,  type="string", help="output file [compulsory] [format: output file in GFF3 format]")
        parser.add_option("-v", "--verbosity", dest="verbosity",     action="store",      default=1,     type="int",    help="trace level [format: int]")
        parser.add_option("-G", "--galaxy",    dest="galaxy",        action="store_true", default=False,                help="used for galaxy [format: bool] [default: False]")
        (options, args) = parser.parse_args()

        # The help text marks these three as compulsory: fail early with a
        # usage message instead of crashing later on a None file name.
        for value, flag in ((options.inputFileName, "-i/--input"), (options.format, "-f/--format"), (options.output, "-o/--output")):
            if value is None:
                parser.error("option %s is compulsory" % (flag))

        self.verbosity     = options.verbosity
        self.inputFileName = options.inputFileName
        self.format        = options.format
        self.output        = options.output
        self.galaxy        = options.galaxy

    def run(self):
        """
        Copy every transcript of the input to the GFF3 output.
        """
        if self.verbosity > 0:
            print("Reading input file...")
        parser = TranscriptContainer(self.inputFileName, self.format, self.verbosity)
        if self.verbosity > 0:
            print("... done")
        writer = Gff3Writer(self.output, self.verbosity, self.title)

        progress = Progress(parser.getNbTranscripts(), "Reading %s" % (self.inputFileName), self.verbosity)
        for transcript in parser.getIterator():
            writer.addTranscript(transcript)
            progress.inc()
        progress.done()
        # Close the writer explicitly so that the whole output is on disk
        # before the file is renamed below (it was previously left to
        # garbage collection).
        writer.close()

        if self.galaxy:
            # NOTE(review): presumably the writer appended ".gff3" to the
            # requested name and Galaxy expects the bare name -- confirm
            # against Gff3Writer.
            os.rename("%s.gff3" % (self.output), self.output)

if __name__ == '__main__':
    launcher = MappingToCoordinates()
    launcher.setAttributesFromCmdLine()
    launcher.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mergeSlidingWindowsClusters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mergeSlidingWindowsClusters.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Merge sliding windows of two different clusterings
"""

import sys
import re
import os
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.structure.Transcript import Transcript

class MergeSlidingWindowsClusters(object):
    """
    Merge the output of several sets of sliding windows
    """

    def __init__(self, verbosity = 0):
        self.verbosity     = verbosity
        self.inputs        = []    # one TranscriptContainer per input file
        self.outputData    = {}    # chromosome -> direction -> start -> end -> tags
        self.nbData        = 0     # number of distinct regions stored so far
        self.nbWrittenData = 0     # number of regions already written
        self.chromosomes   = []    # union of the chromosomes of all inputs
        self.writer        = None

    def __del__(self):
        if self.writer is not None:
            self.writer.close()

    def addInput(self, fileName, fileFormat):
        """
        Register an input file and record the chromosomes it contains
        """
        container = TranscriptContainer(fileName, fileFormat, self.verbosity)
        self.inputs.append(container)
        self.chromosomes = list(set(self.chromosomes).union(container.getChromosomes()))

    def setOutput(self, fileName):
        """
        Set the name of the GFF3 output file
        """
        self.writer = Gff3Writer(fileName, self.verbosity)

    def readInput(self, i, chromosome):
        """
        Store the regions of input #i which lie on the given chromosome.
        A region already stored (same strand, start and end) gets its tags
        updated with the new ones.  The program exits if a region has the same
        start as a stored one but a different end.
        """
        progress = Progress(self.inputs[i].getNbTranscripts(), "Reading file #%d -- chromosome %s" % (i+1, chromosome), self.verbosity)
        for transcript in self.inputs[i].getIterator():
            progress.inc()
            if chromosome != transcript.getChromosome():
                continue
            start     = transcript.getStart()
            end       = transcript.getEnd()
            direction = transcript.getDirection()
            tags      = transcript.tags
            regions   = self.outputData.setdefault(chromosome, {}).setdefault(direction, {}).setdefault(start, {})
            if end in regions:
                regions[end].update(tags)
                continue
            # The end is new for this start: any end already stored is an
            # inconsistency.  (The test used to be made only when the end was
            # already known, so it depended on dictionary ordering and missed
            # the first conflicting region.)
            if regions:
                sys.exit("Error! Two regions starting at %d end are not consistent (%d and %d) in %s on strand %d" % (start, end, list(regions)[0], chromosome, direction))
            regions[end] = tags
            self.nbData += 1
        progress.done()


    def writeOutput(self, chromosome):
        """
        Write the regions stored for the given chromosome, then forget them.
        Regions are renamed "region_<n>" and their tags starting with "Name_"
        or "ID_" are dropped.
        """
        progress = Progress(self.nbData - self.nbWrittenData, "Writing output for chromosome %s" % (chromosome), self.verbosity)
        # .get(): nothing may have been stored for this chromosome
        directions = self.outputData.get(chromosome, {})
        for direction in directions:
            for start in directions[direction]:
                for end in directions[direction][start]:
                    transcript = Transcript()
                    transcript.setChromosome(chromosome)
                    transcript.setStart(start)
                    transcript.setEnd(end)
                    transcript.setDirection(direction)
                    transcript.tags = directions[direction][start][end]
                    transcript.setName("region_%d" % (self.nbWrittenData + 1))
                    # iterate over a copy of the names: tags are deleted in the loop
                    for tag in list(transcript.getTagNames()):
                        if tag.startswith("Name_") or tag.startswith("ID_"):
                            del transcript.tags[tag]
                    self.nbWrittenData += 1
                    self.writer.addTranscript(transcript)
                    progress.inc()
        self.writer.write()
        progress.done()
        self.outputData = {}

    def merge(self):
        """
        Read all the inputs and write the merged regions, one chromosome at a time
        """
        for chromosome in self.chromosomes:
            for i in range(len(self.inputs)):
                self.readInput(i, chromosome)
            self.writeOutput(chromosome)
        self.writer.close()


if __name__ == "__main__":

    # parse command line
    description = "Merge Sliding Windows Clusters v1.0.2: Merge two files containing the results of a sliding windows clustering. [Category: Sliding Windows]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1",       dest="inputFileName1", action="store",            type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--inputFormat1", dest="inputFormat1",   action="store",            type="string", help="format of the input file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2",       dest="inputFileName2", action="store",            type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--inputFormat2", dest="inputFormat2",   action="store",            type="string", help="format of the input file 2 [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output",       dest="outputFileName", action="store",            type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity",    dest="verbosity",      action="store", default=1, type="int",    help="trace level [format: int]")
    (options, args) = parser.parse_args()

    merger = MergeSlidingWindowsClusters(options.verbosity)
    merger.addInput(options.inputFileName1, options.inputFormat1)
    merger.addInput(options.inputFileName2, options.inputFormat2)
    merger.setOutput(options.outputFileName)
    merger.merge()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mergeTranscriptLists.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mergeTranscriptLists.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,174 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""Merge elements of two transcript lists with some condition"""\n+\n+import os, random, shutil, glob\n+from optparse import OptionParser\n+from commons.core.parsing.SequenceListParser import SequenceListParser\n+from commons.core.parsing.BedParser import BedParser\n+from commons.core.parsing.GffParser import GffParser\n+from commons.core.writer.TranscriptWriter import TranscriptWriter\n+from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from SMART.Java.Python.misc.Progress import Progress\n+\n+\n+\n+class MergeLists(object):\n+\n+ def __init__(self, verbosity):\n+ self.verbosity = verbosity\n+ self.seed = random.randint(0, 100000)\n+ self.aggregation = False\n+ self.normalization = False\n+ self.distance = False\n+ self.antisense = False\n+ self.colinear = False\n+ self.fileNames = {}\n+ self.formats = {}\n+ self.tmpFileNames = []\n+ self.logHandle = None\n+\n+# def __del__(self):\n+# for fileNameRoot in self.tmpFileNames:\n+# for fileName in glob.glob("%s*" % (fileNameRoot)):\n+# os.remove(fileName)\n+# if self.logHandle != None:\n+# self.logHandle.close()\n+# self.logHandle = None\n+\n+ def setLogFileName(self, fileName):\n+ self.logHandle = open(fileName, "w")\n+\n+ def setInputFileName(self, fileName, format, id):\n+ self.fileNames[id] = fileName\n+ self.formats[id] = format\n+\n+ def setOutputFileName(self, fileName):\n+ self.outputFileName = 
fileName\n+\n+ def setAggregate(self, aggregation):\n+ self.aggregation = aggregation\n+\n+ def setNormalization(self, normalization):\n+ self.normalization = normalization\n+\n+ def setDistance(self, distance):\n+ self.distance = distance\n+\n+ def setAntisense(self, antisense):\n+ self.antisense = antisense\n+\n+ def setColinear(self, colinear):\n+ self.colinear = colinear\n+\n+ def createTmpFileName(self, root):\n+ fileName = "tmp_%s_%d.gff3" % (root, self.seed)\n+ self.tmpFileNames.append(fileName)\n+ return fileName\n+\n+ def selfMerge(self, fileName, format, outputFileName):\n+ transcriptListComparator = TranscriptListsComparator(self.logHandle,'..b'nscriptListComparator.compareTranscriptList()\n+\n+ def mergeFiles(self, fileName1, fileName2, outputFileName):\n+ outputFile = open(outputFileName, "w")\n+ shutil.copyfileobj(open(fileName1, "r"), outputFile)\n+ shutil.copyfileobj(open(fileName2, "r"), outputFile)\n+ outputFile.close()\n+\n+ def run(self):\n+ selectedFileQuery = self.createTmpFileName("query")\n+ self.keepOverlapping({0: self.fileNames[0], 1: self.fileNames[0]}, {0: "gff3", 1: "gff3"}, selectedFileQuery)\n+ mergeFileTarget = self.createTmpFileName("target")\n+ self.selfMerge(self.fileNames[1], self.formats[1], mergeFileTarget)\n+ if not self.aggregation:\n+ overlapFile = self.createTmpFileName("overlap")\n+ self.keepOverlapping({0: mergeFileTarget, 1: selectedFileQuery}, {0: "gff3", 1: "gff3"}, overlapFile)\n+ mergeFileTarget = overlapFile\n+ mergeFileMerged = self.createTmpFileName("merged")\n+ self.mergeFiles(mergeFileTarget, selectedFileQuery, mergeFileMerged)\n+ self.selfMerge(mergeFileMerged, "gff3", self.outputFileName)\n+\n+\n+\n+if __name__ == "__main__":\n+ \n+ # parse command line\n+ description = "Merge Lists v1.0.3: Merge the elements of two lists of genomic coordinates. 
[Category: Merge]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")\n+ parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")\n+ parser.add_option("-j", "--input2", dest="inputFileName2", action="store", default=None, type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")\n+ parser.add_option("-g", "--format2", dest="format2", action="store", default=None, type="string", help="format of file 2 [compulsory] [format: file in transcript format]")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")\n+ parser.add_option("-k", "--all", dest="all", action="store_true", default=False, help="print all the transcripts, not only those overlapping [format: bool] [default: false]")\n+ parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. 
distance between two transcripts [format: int] [default: 0]")\n+ parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]")\n+ parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]")\n+ parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")\n+ (options, args) = parser.parse_args()\n+\n+# ml = MergeLists(logHandle, options.verbosity)\n+ \n+ ml = MergeLists(0)\n+ ml.setInputFileName(options.inputFileName1, options.format1, 0)\n+ ml.setInputFileName(options.inputFileName2, options.format2, 1)\n+ ml.setOutputFileName(options.outputFileName)\n+ ml.setAntisense(options.antisense)\n+ ml.setColinear(options.colinear)\n+ ml.setAggregate(options.all)\n+ ml.setNormalization(options.normalize)\n+ ml.setDistance(options.distance)\n+ ml.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/misc/MultipleRPlotter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/misc/MultipleRPlotter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#
# Copyright INRA-URGI 2009-2012
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Plot multiple curves with RPlotter
"""

import os
import subprocess
import random
import math
from SMART.Java.Python.misc.RPlotter import RPlotter

# number of colors requested from the RColorBrewer "Set1" palette
NBCOLORS = 9

class MultipleRPlotter(object):
    """
    Plot some curves
    @ivar fileName: name of the file
    @type fileName: string
    @ivar height:   height of the file
    @type height:   int
    @ivar width:    width of the file
    @type width:    int
    @ivar plots:    plots to be included
    @type plots:    list of L{RPlotter}
    @ivar keep:     keep script lines
    @type keep:     boolean
    @ivar format:   format of the file
    @type format:   string
    """

    def __init__(self, fileName, verbosity = 0, keep = False):
        """
        Constructor
        @param fileName:  name of the file to produce
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        @param keep:      keep temporary files
        @type  keep:      boolean
        """
        self.fileName       = fileName
        self.verbosity      = verbosity
        self.keep           = keep
        self.format         = "png"
        self.width          = 1000
        self.height         = 500
        self.plots          = []
        self.scriptFileName = "tmpScript-%d.R" % (os.getpid())

    def __del__(self):
        """
        Destructor
        Remove script files
        """
        if not self.keep:
            if os.path.exists(self.scriptFileName):
                os.remove(self.scriptFileName)
            # "<script>.Rout" is the transcript left by "R CMD BATCH"
            outputFileName = "%sout" % (self.scriptFileName)
            if os.path.exists(outputFileName):
                os.remove(outputFileName)

    def setFormat(self, format):
        """
        Set the format of the picture
        @param format: the format
        @type  format: string
        """
        if format not in ("png", "pdf", "jpeg", "bmp", "tiff"):
            raise Exception("Format '%s' is not supported by RPlotter" % (format))
        self.format = format


    def setWidth(self, width):
        """
        Set the width of the image produced
        @param width: width of the image
        @type  width: int
        """
        self.width = width


    def setHeight(self, height):
        """
        Set the height of the image produced
        @param height: height of the image
        @type  height: int
        """
        self.height = height


    def setImageSize(self, width, height):
        """
        Set the dimensions of the image produced
        @param width:  width of the image
        @type  width:  int
        @param height: height of the image
        @type  height: int
        """
        self.width  = width
        self.height = height

    def addPlot(self, plot):
        """
        Add a plot
        @param plot: plot to be included
        @type  plot: L{RPlotter}
        """
        self.plots.append(plot)

    def plot(self):
        """
        Plot the figures: write an R script which stacks the plots in one
        column, and run it with "R CMD BATCH".
        The R executable is taken from the SMARTRPATH environment variable
        when it is set.
        Raise an Exception (and keep the script files) if R fails.
        """
        scriptHandle = open(self.scriptFileName, "w")
        try:
            scriptHandle.write("library(RColorBrewer)\n")
            scriptHandle.write("colorPanel = brewer.pal(n=%d, name=\"Set1\")\n" % (NBCOLORS))
            # pdf() names its output argument "file", the other devices "filename"
            scriptHandle.write("%s(%s = \"%s\", width = %d, height = %d, bg = \"white\")\n" % (self.format, "filename" if self.format != "pdf" else "file", self.fileName, self.width, self.height))
            scriptHandle.write("par(mfrow=c(%d, 1))\n" % (len(self.plots)))
            for plot in self.plots:
                scriptHandle.write(plot.getScript())
            scriptHandle.write("dev.off()\n")
        finally:
            # do not leak the handle if one of the plots fails to give its script
            scriptHandle.close()
        rCommand = os.environ.get("SMARTRPATH", "R")
        # Argument list, no shell: no quoting problem with the executable path.
        try:
            status = subprocess.call([rCommand, "CMD", "BATCH", self.scriptFileName])
        except OSError:
            # The executable could not be started; report it like the shell
            # did ("command not found") so that callers get the same Exception.
            status = 127
        if status != 0:
            self.keep = True
            raise Exception("Problem with the execution of script file %s, status is: %s" % (self.scriptFileName, status))
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/misc/Progress.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/misc/Progress.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
class Progress(object):
    """
    Show the progress of a process as a text bar on the standard output

    Nothing is printed when 'verbosity' is 0 or less.
    @ivar aim:            number of steps expected
    @ivar progress:       number of steps done so far
    @ivar message:        label printed before the bar (shortened to maxMessageSize)
    @ivar length:         number of bar cells drawn at the last refresh (-1 before the first one)
    @ivar verbosity:      verbosity
    @ivar maxMessageSize: width reserved for the label
    @ivar barSize:        width of the bar, in characters
    @ivar startTime:      creation time, as given by time.time()
    @ivar elapsed:        seconds elapsed at the last refresh
    """

    def __init__(self, aim, message = "Progress", verbosity = 0):
        """
        Constructor: store the settings and draw the (empty) bar
        @param aim:       number of steps expected
        @type  aim:       int
        @param message:   label of the bar
        @type  message:   string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.aim = aim
        self.progress = 0
        self.message = message
        self.length = -1
        self.verbosity = verbosity
        self.maxMessageSize = 50
        self.barSize = 80
        self.startTime = time.time()
        self.elapsed = 0
        if len(self.message) > self.maxMessageSize:
            self.message = self.message[0:self.maxMessageSize-3] + "..."
        self.show()


    def inc(self):
        """
        Record one more step and refresh the display if needed
        """
        self.progress += 1
        self.show()


    def getPrintableElapsedTime(self, time):
        """
        Format a duration as hours/minutes, minutes/seconds or seconds
        @param time: a duration, in seconds (the fractional part is dropped)
        @type  time: int or float
        @return:     the formatted duration
        """
        seconds = int(time)
        # '//' keeps integer arithmetic whatever the division semantics are:
        # with true division, any non-zero duration would count as hours.
        timeHou = seconds // 3600
        timeMin = seconds // 60 - 60 * timeHou
        timeSec = seconds % 60
        if timeHou > 0:
            return "%3dh %2dm" % (timeHou, timeMin)
        if timeMin > 0:
            return "%2dm %2ds" % (timeMin, timeSec)
        return "%2ds " % (timeSec)


    def show(self):
        """
        Redraw the bar if it grew, or if more than 10 seconds passed since the last redraw
        After 5 seconds, an estimated remaining time is appended.
        """
        if self.verbosity <= 0:
            return
        if self.aim == 0:
            return
        messageSize = len(self.message)
        length = int(self.progress / float(self.aim) * self.barSize)
        elapsed = int(time.time() - self.startTime)
        if (length > self.length) or (elapsed > self.elapsed + 10):
            self.length = length
            self.elapsed = elapsed
            string = "%s%s[%s%s] %d/%d" % (self.message, " " * max(0, self.maxMessageSize - messageSize), "=" * self.length, " " * (self.barSize - self.length), self.progress, self.aim)
            # No estimate can be made before the first step is done
            # (and the ratio below would be a division by zero).
            if elapsed > 5 and self.progress > 0:
                done = float(self.progress) / self.aim
                total = elapsed / done
                remaining = total - elapsed
                string += " ETA: %s " % (self.getPrintableElapsedTime(remaining))
            # carriage return: the next refresh overwrites this line
            string += "\r"
            sys.stdout.write(string)
            sys.stdout.flush()


    def done(self):
        """
        Print the final line: a full bar, the aim and the total time spent
        """
        if self.verbosity > 0:
            messageSize = len(self.message)
            elapsed = time.time() - self.startTime
            sys.stdout.write("%s%s[%s] %d completed in %s \n" % (self.message, " " * max(0, self.maxMessageSize - messageSize), "=" * self.barSize, self.aim, self.getPrintableElapsedTime(elapsed)))
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/misc/RPlotter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/misc/RPlotter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,821 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+\n+import os\n+import subprocess\n+import random\n+import math\n+\n+minPositiveValue = 10e-6\n+\n+"""\n+Plot simple curves in R\n+"""\n+\n+class RPlotter(object):\n+ """\n+ Plot some curves\n+ @ivar nbColors: number of different colors\n+ @type nbColors: int\n+ @ivar fileName: name of the file\n+ @type fileName: string\n+ @ivar lines: lines to be plotted\n+ @type lines: array of dict\n+ @ivar names: name of the lines\n+ @type names: array of strings\n+ @ivar colors: color of the lines\n+ @type colors: array of strings\n+ @ivar types: type of the lines (plain or dashed)\n+ @type types: array of strings\n+ @ivar format: format of the picture\n+ @type format: string\n+ @ivar lineWidth: width of the line in a xy-plot\n+ @type lineWidth: int\n+ @ivar xMin: minimum value taken on the x-axis\n+ @type xMin: int\n+ @ivar xMax: maximum value taken on the x-axis\n+ @type xMax: int\n+ @ivar yMin: minimum value taken on the y-axis\n+ @type yMin: int\n+ @ivar yMax: maximum value taken on the y-axis\n+ @type yMax: int\n+ @ivar minimumX: minimum value allowed on the x-axis\n+ @type minimumX: int\n+ @ivar maximumX: maximum value allowed on the x-axis\n+ @type maximumX: int\n+ @ivar minimumY: minimum value allowed on the y-axis\n+ @type minimumY: int\n+ @ivar maximumY: maximum value allowed on the y-axis\n+ @type maximumY: int\n+ @ivar leftMargin: add some margin in the left part of the plot\n+ @type leftMargin: float\n+ @ivar rightMargin: add some margin in the right part of the plot\n+ @type rightMargin: float\n+ @ivar downMargin: add some margin at 
the top of the plot\n+ @type downMargin: float\n+ @ivar upMargin: add some margin at the bottom of the plot\n+ @type upMargin: float\n+ @ivar logX: use log scale on the x-axis\n+ @type logX: boolean\n+ @ivar logY: use log scale on the y-axis\n+ @type logY: boolean\n+ @ivar logZ: use log scale on the z-axis (the color)\n+ @type logZ: boolean\n+ @ival fill: if a value is not given, fill it with given value\n+ @type fill: int\n+ @ival bucket: cluster the data into buckets of given size\n+ @type bucket: int\n+ @ival seed: a random number\n+ @type seed: int\n+ @ival regression: plot a linear regression\n+ @type regression: boolean\n+ @ival legend: set the legend\n+ @type legend: boolean\n+ @ival legendBySide: set the legend outside of the plot\n+ @type legendBySde: boolean\n+ @ival xLabel: l'..b' lwd = %d, cex = 1.5, ncol = 1, bg = \\"white\\")\\n" % (self.lineWidth)\n+\n+ return script\n+ \n+\n+\n+ def plot(self):\n+ """\n+ Plot the lines\n+ """\n+ scriptFileName = "tmpScript-%d.R" % (self.seed)\n+ scriptHandle = open(scriptFileName, "w")\n+ scriptHandle.write("library(RColorBrewer)\\n")\n+ scriptHandle.write("colorPanel = brewer.pal(n=%d, name=\\"Set1\\")\\n" % (self.nbColors))\n+ scriptHandle.write("%s(%s = \\"%s\\", width = %d, height = %d, bg = \\"white\\")\\n" % (self.format, "filename" if self.format != "pdf" else "file", self.fileName, self.width, self.height))\n+ scriptHandle.write(self.getScript())\n+ scriptHandle.write("dev.off()\\n")\n+ scriptHandle.close()\n+ rCommand = "R"\n+ if "SMARTRPATH" in os.environ:\n+ rCommand = os.environ["SMARTRPATH"]\n+ command = "\\"%s\\" CMD BATCH %s" % (rCommand, scriptFileName)\n+ status = subprocess.call(command, shell=True)\n+\n+ if status != 0:\n+ self.keep = True\n+ raise Exception("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))\n+ \n+\n+ def getCorrelationData(self):\n+ if not self.regression:\n+ return ""\n+ scriptFileName = "tmpScript-%d.R" % (self.seed)\n+ rScript = 
open(scriptFileName, "w")\n+ rScript.write("data = scan(\\"tmpData-%d-0.dat\\", list(x = -0.000000, y = -0.000000))\\n" % (self.seed))\n+ x = "log10(data$x)" if self.logX else "data$x"\n+ y = "log10(data$y)" if self.logY else "data$y"\n+ rScript.write("summary(lm(%s ~ %s))\\n" % (y, x))\n+ rScript.close()\n+ rCommand = "R"\n+ if "SMARTRPATH" in os.environ:\n+ rCommand = os.environ["SMARTRPATH"]\n+ command = "\\"%s\\" CMD BATCH %s" % (rCommand, scriptFileName)\n+ status = subprocess.call(command, shell=True)\n+ if status != 0:\n+ self.keep = True\n+ raise Exception("Problem with the execution of script file %s computing the correlation, status is: %s" % (scriptFileName, status))\n+ outputRFile = open("%sout" % (scriptFileName))\n+ output = ""\n+ start = False\n+ end = False\n+ for line in outputRFile:\n+ if start and "> " in line:\n+ end = True\n+ if start and not end:\n+ output += line\n+ if "summary" in line:\n+ start = True\n+ return output\n+\n+\n+ def getSpearmanRho(self):\n+ """\n+ Get the Spearman rho correlation using R\n+ """\n+ return None\n+ if not self.points and not self.barplot and not self.heatPoints:\n+ raise Exception("Cannot compute Spearman rho correlation whereas not in \'points\' or \'bar\' mode.")\n+ \n+ scriptFileName = "tmpScript-%d.R" % (self.seed)\n+ rScript = open(scriptFileName, "w")\n+ rScript.write("library(Hmisc)\\n")\n+ rScript.write("data = scan(\\"tmpData-%d-0.dat\\", list(x = -0.000000, y = -0.000000))\\n" % (self.seed))\n+ rScript.write("spearman(data$x, data$y)\\n")\n+ rScript.close()\n+\n+ rCommand = "R"\n+ if "SMARTRPATH" in os.environ:\n+ rCommand = os.environ["SMARTRPATH"]\n+ command = "\\"%s\\" CMD BATCH %s" % (rCommand, scriptFileName)\n+ status = subprocess.call(command, shell=True)\n+\n+ if status != 0:\n+ self.keep = True\n+ raise Exception("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))\n+\n+ outputRFile = open("%sout" % (scriptFileName))\n+ nextLine = False\n+ for line in 
outputRFile:\n+ line = line.strip()\n+ if nextLine:\n+ if line == "NA":\n+ return None\n+ return float(line)\n+ nextLine = False\n+ if line == "rho":\n+ nextLine = True\n+\n+ return None\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/misc/UnlimitedProgress.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/misc/UnlimitedProgress.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
class UnlimitedProgress(object):
    """
    Show the progress of a process when no upper bound is known

    Nothing is printed when 'verbosity' is 0 or less.
    @ivar step:           the counter is printed every 'step' increments
    @ivar progress:       number of steps done so far
    @ivar message:        label printed before the counter (shortened to maxMessageSize)
    @ivar verbosity:      verbosity
    @ivar maxMessageSize: maximum size of the label
    @ivar startTime:      creation time, as given by time.time()
    @ivar elapsed:        seconds elapsed at the last refresh
    """

    def __init__(self, step = 1000, message = "Progress", verbosity = 0):
        """
        Constructor: store the settings and print the first line
        @param step:      number of increments between two refreshes
        @type  step:      int
        @param message:   label of the counter
        @type  message:   string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.step = step
        self.progress = 0
        self.message = message
        self.verbosity = verbosity
        self.maxMessageSize = 50
        self.startTime = time.time()
        self.elapsed = 0
        if len(self.message) > self.maxMessageSize:
            self.message = self.message[0:self.maxMessageSize-3] + "..."
        self.show()


    def inc(self):
        """
        Record one more step and refresh the display if needed
        """
        self.progress += 1
        self.show()


    def getPrintableElapsedTime(self, time):
        """
        Format a duration as hours/minutes, minutes/seconds or seconds
        @param time: a duration, in seconds (the fractional part is dropped)
        @type  time: int or float
        @return:     the formatted duration
        """
        seconds = int(time)
        # '//' keeps integer arithmetic whatever the division semantics are:
        # with true division, any non-zero duration would count as hours.
        timeHou = seconds // 3600
        timeMin = seconds // 60 - 60 * timeHou
        timeSec = seconds % 60
        if timeHou > 0:
            return "%3dh %2dm" % (timeHou, timeMin)
        if timeMin > 0:
            return "%2dm %2ds" % (timeMin, timeSec)
        return "%2ds" % (timeSec)


    def show(self):
        """
        Print the counter every 'step' increments, or if more than 10 seconds passed since the last print
        """
        if self.verbosity <= 0:
            return
        elapsed = int(time.time() - self.startTime)
        if (self.progress % self.step == 0) or (elapsed > self.elapsed + 10):
            self.elapsed = elapsed
            # carriage return: the next refresh overwrites this line
            string = "%s %d -- time spent: %s\r" % (self.message, self.progress, self.getPrintableElapsedTime(elapsed))
            sys.stdout.write(string)
            sys.stdout.flush()


    def done(self):
        """
        Print the final counter and the total time spent
        """
        if self.verbosity > 0:
            elapsed = time.time() - self.startTime
            string = "%s %d -- time spent: %s\r" % (self.message, self.progress, self.getPrintableElapsedTime(elapsed))
            sys.stdout.write(string + "\n")
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/misc/Utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/misc/Utils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,271 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""Some useful functions"""\n+\n+import sys, os\n+import random\n+import subprocess\n+\n+\n+def writeFile(fileName, content):\n+ """\n+ Write the content of a file\n+ """\n+ handle = open(fileName, "w")\n+ handle.write(content)\n+ handle.close()\n+\n+def sumOfLists(list1, list2):\n+ """\n+ Element by element sum\n+ """\n+ if len(list1) != len(list2):\n+ sys.exit("Cannot sum list whose sizes are different!")\n+ return [list1[i] + list2[i] for i in range(len(list1))]\n+\n+\n+def protectBackslashes(string):\n+ """\n+ Protect the backslashes in a path by adding another backslash\n+ """\n+ return string.replace("\\\\", "\\\\\\\\")\n+ \n+\n+def getHammingDistance(string1, string2):\n+ """\n+ Compute Hamming distance between two strings\n+ """\n+ if len(string1) != len(string2):\n+ raise Exception("Error, size of %s and %s differ" % (string1, string2))\n+ return sum(ch1 != ch2 for ch1, ch2 in zip(string1, string2))\n+\n+\n+def getLevenshteinDistance(string1, string2):\n+ """\n+ Compute Levenshtein distance between two strings\n+ """\n+ if len(string1) < len(string2):\n+ return getLevenshteinDistance(string2, string1)\n+ if not string1:\n+ return len(string2)\n+ previousRow = xrange(len(string2) + 1)\n+ for i, c1 in enumerate(string1):\n+ currentRow = [i + 1]\n+ for j, c2 in enumerate(string2):\n+ insertions = previousRow[j + 1] + 1\n+ deletions = currentRow[j] + 1\n+ substitutions = previousRow[j] + (c1 != c2)\n+ currentRow.append(min(insertions, deletions, substitutions))\n+ previousRow = currentRow\n+ return previousRow[-1]\n+\n+\n+def 
getMinAvgMedMax(values):\n+ """\n+ Get some stats about a dict\n+ @param values: a distribution (the value being the number of occurrences of the key)\n+ @type values: dict int to int\n+ @return: a tuple\n+ """\n+ minValues = min(values.keys())\n+ maxValues = max(values.keys())\n+ sumValues = sum([value * values[value] for value in values])\n+ nbValues = sum(values.values())\n+ allValues = []\n+ for key in values:\n+ for i in range(values[key]):\n+ allValues.append(key)\n+ sortedValues = sorted(allValues)\n+ sorted(values.values())\n+ if (nbValues % 2 == 0):\n+ medValues = (sortedValues[nbValues / 2 - 1] + sortedValues[nbValues / 2]) / 2.0\n+ else:\n+ medValues = sortedValues[(nbValues + 1) / 2 - 1]\n+ return (minValues, float('..b'les differ (%d != %d)" % (len(lines1), len(lines2))\n+ return False\n+ for i in xrange(len(lines1)):\n+ if lines1[i] != lines2[i]:\n+ print "Line %d differ (\'%s\' != \'%s\')" % (i, lines1[i].strip(), lines2[i].strip())\n+ return False\n+ return True\n+\n+\n+def binomialCoefficient(a, b):\n+ """\n+ Compute cumulated product from a to b\n+ @param a: a value\n+ @type a: int\n+ @param b: a value\n+ @type b: int\n+ """\n+ if a > b / 2:\n+ a = b-a\n+ p = 1.0\n+ for i in range(b-a+1, b+1):\n+ p *= i\n+ q = 1.0\n+ for i in range(1, a+1):\n+ q *= i\n+ return p / q\n+\n+\n+memory = {}\n+\n+# def fisherExactPValue(a, b, c, d):\n+# """\n+# P-value of Fisher exact test for 2x2 contingency table\n+# """\n+# if (a, b, c, d) in memory:\n+# return memory[(a, b, c, d)]\n+\n+# n = a + b + c + d\n+# i1 = binomialCoefficient(a, a+b)\n+# i2 = binomialCoefficient(c, a+c)\n+# i3 = binomialCoefficient(c+d, n)\n+# pValue = i1 * i2 / i3\n+\n+# memory[(a, b, c, d)] = pValue\n+\n+# return pValue\n+ \n+\n+def fisherExactPValue(a, b, c, d):\n+ if (a, b, c, d) in memory:\n+ return memory[(a, b, c, d)]\n+\n+ scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))\n+ rScript = open(scriptFileName, "w")\n+ rScript.write("data = matrix(c(%d, %d, %d, %d), nr=2)\\n" 
% (a, b, c, d))\n+ rScript.write("fisher.test(data)\\n")\n+ #rScript.write("chisq.test(data)\\n")\n+ rScript.close()\n+\n+ rCommand = "R"\n+ if "SMARTRPATH" in os.environ:\n+ rCommand = os.environ["SMARTRPATH"]\n+ command = "\\"%s\\" CMD BATCH %s" % (rCommand, scriptFileName)\n+ status = subprocess.call(command, shell=True)\n+\n+ if status != 0:\n+ sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))\n+\n+ outputRFileName = "%sout" % (scriptFileName)\n+ outputRFile = open(outputRFileName)\n+ pValue = None\n+ pValueTag = "p-value "\n+ for line in outputRFile:\n+ line = line.strip()\n+ if line == "": continue\n+ for splittedLine in line.split(","):\n+ splittedLine = splittedLine.strip()\n+ if splittedLine.startswith(pValueTag):\n+ pValue = float(splittedLine.split()[-1])\n+ break\n+\n+ if pValue == None:\n+ sys.exit("Problem with the cannot find p-value! File %s, values are: %d, %d, %d, %d" % (scriptFileName, a, b, c, d))\n+\n+ os.remove(scriptFileName)\n+ os.remove(outputRFileName)\n+\n+ memory[(a, b, c, d)] = pValue\n+\n+ return pValue\n+\n+\n+def fisherExactPValueBulk(list):\n+\n+ scriptFileName = "tmpScript-%d.R" % (random.randint(0, 10000))\n+ rScript = open(scriptFileName, "w")\n+ for element in list:\n+ rScript.write("fisher.test(matrix(c(%d, %d, %d, %d), nr=2))$p.value\\n" % (int(element[0]), int(element[1]), int(element[2]), int(element[3])))\n+ rScript.close()\n+\n+ rCommand = "R"\n+ if "SMARTRPATH" in os.environ:\n+ rCommand = os.environ["SMARTRPATH"]\n+ command = "\\"%s\\" CMD BATCH %s" % (rCommand, scriptFileName)\n+ status = subprocess.call(command, shell=True)\n+\n+ if status != 0:\n+ sys.exit("Problem with the execution of script file %s, status is: %s" % (scriptFileName, status))\n+\n+ outputRFileName = "%sout" % (scriptFileName)\n+ outputRFile = open(outputRFileName)\n+ pValue = None\n+ pValueTag = "[1] "\n+ results = {}\n+ cpt = 0\n+ for line in outputRFile:\n+ line = line.strip()\n+ if line == "": 
continue\n+ if line.startswith(pValueTag):\n+ pValue = float(line.split()[-1])\n+ results[list[cpt][0:2]] = pValue\n+ cpt += 1\n+\n+ if pValue == None:\n+ sys.exit("Problem with the cannot find p-value!")\n+ if cpt != len(list):\n+ sys.exit("Error in the number of p-values computed by R in file \'%s\'!" % (scriptFileName))\n+\n+ os.remove(scriptFileName)\n+ os.remove(outputRFileName)\n+\n+ return results\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/modifyFasta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/modifyFasta.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
if __name__ == "__main__":

    # parse command line
    description = "Modify Sequence List v1.0.1: Extend or shrink a list of sequences. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in FASTA format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in FASTA format]")
    optionParser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="keep first nucleotides [format: int]")
    optionParser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="keep last nucleotides [format: int]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # The options documented as compulsory are checked here, so that a missing
    # one gives a usage message instead of a failure inside the parser or writer.
    if options.inputFileName is None:
        optionParser.error("the input file is compulsory (option -i)")
    if options.outputFileName is None:
        optionParser.error("the output file is compulsory (option -o)")

    # read every sequence, shrink it as requested, and write it back
    parser = FastaParser(options.inputFileName, options.verbosity)
    writer = FastaWriter(options.outputFileName, options.verbosity)
    progress = Progress(parser.getNbSequences(), "Reading file %s" % (options.inputFileName), options.verbosity)
    for sequence in parser.getIterator():
        if options.start is not None:
            sequence.shrinkToFirstNucleotides(options.start)
        if options.end is not None:
            sequence.shrinkToLastNucleotides(options.end)
        writer.addSequence(sequence)
        progress.inc()
    progress.done()
    # NOTE(review): the writer is never explicitly flushed or closed here;
    # confirm that FastaWriter writes on addSequence() or on destruction.
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/modifyGenomicCoordinates.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/modifyGenomicCoordinates.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
if __name__ == "__main__":

    # parse command line
    description = "Modify Genomic Coordinates v1.0.1: Extend or shrink a list of genomic coordinates. [Category: Data Modification]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    optionParser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    optionParser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="restrict to the start of the transcript [format: int]")
    optionParser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="restrict to the end of the transcript [format: int]")
    optionParser.add_option("-5", "--fivePrime", dest="fivePrime", action="store", default=None, type="int", help="extend to the 5' direction [format: int]")
    optionParser.add_option("-3", "--threePrime", dest="threePrime", action="store", default=None, type="int", help="extend to the 3' direction [format: int]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")

    (options, args) = optionParser.parse_args()

    # The options documented as compulsory are checked here, so that a missing
    # one gives a usage message instead of a failure inside the container or writer.
    if options.inputFileName is None:
        optionParser.error("the input file is compulsory (option -i)")
    if options.format is None:
        optionParser.error("the format of the input file is compulsory (option -f)")
    if options.outputFileName is None:
        optionParser.error("the output file is compulsory (option -o)")

    parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)

    writer = TranscriptWriter(options.outputFileName, "gff3", options.verbosity)

    nbItems = parser.getNbItems()
    # a single parenthesised string prints the same with the print statement and the print function
    print("%i items found" % (nbItems))

    # the restrictions are applied before the extensions, in this fixed order
    progress = Progress(nbItems, "Analyzing sequences of " + options.inputFileName, options.verbosity)
    for transcript in parser.getIterator():
        if options.start is not None:
            transcript.restrictStart(options.start)
        if options.end is not None:
            transcript.restrictEnd(options.end)
        if options.fivePrime is not None:
            transcript.extendStart(options.fivePrime)
        if options.threePrime is not None:
            transcript.extendEnd(options.threePrime)

        writer.addTranscript(transcript)

        progress.inc()
    progress.done()

    writer.write()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/modifySequenceList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/modifySequenceList.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Modify the content of a FASTA or FASTQ file

Reads every sequence of the input file, optionally keeps only its first
(-s) and/or last (-e) nucleotides, and hands the result to a writer of the
same format as the input.
"""
import sys
from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from SMART.Java.Python.misc.Progress import Progress


if __name__ == "__main__":

    # parse command line
    description = "Modify Sequence List v1.0.1: Extend or shrink a list of sequences. [Category: Data Modification]"

    # Kept under its own name: the original rebound "parser" to the sequence
    # parser, which made the option parser unreachable for error reporting.
    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input",     dest="inputFileName",  action="store",               type="string", help="input file [compulsory] [format: file in format given by -f]")
    optionParser.add_option("-o", "--output",    dest="outputFileName", action="store", default=None, type="string", help="output file [compulsory] [format: output file in format given by -f]")
    optionParser.add_option("-f", "--format",    dest="format",         action="store",               type="string", help="format of the file [compulsory] [format: sequence file format]")
    optionParser.add_option("-s", "--start",     dest="start",          action="store", default=None, type="int",    help="keep first nucleotides [format: int]")
    optionParser.add_option("-e", "--end",       dest="end",            action="store", default=None, type="int",    help="keep last nucleotides [format: int]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # The help text declares these three options compulsory; fail early with
    # a usage message instead of crashing later on a None file name.
    for flag, value in (("-i/--input", options.inputFileName), ("-o/--output", options.outputFileName), ("-f/--format", options.format)):
        if value is None:
            optionParser.error("option %s is compulsory" % (flag))

    # pick the parser/writer pair matching the declared format
    if options.format == "fasta":
        sequenceParser = FastaParser(options.inputFileName, options.verbosity)
        writer         = FastaWriter(options.outputFileName, options.verbosity)
    elif options.format == "fastq":
        sequenceParser = FastqParser(options.inputFileName, options.verbosity)
        writer         = FastqWriter(options.outputFileName, options.verbosity)
    else:
        sys.exit("Do not understand '%s' file format." % (options.format))

    progress = Progress(sequenceParser.getNbSequences(), "Reading file %s" % (options.inputFileName), options.verbosity)
    for sequence in sequenceParser.getIterator():
        # both restrictions may be combined; the start restriction is applied first
        if options.start is not None:
            sequence.shrinkToFirstNucleotides(options.start)
        if options.end is not None:
            sequence.shrinkToLastNucleotides(options.end)
        writer.addSequence(sequence)
        progress.inc()
    progress.done()
    # NOTE(review): the writer is never explicitly closed or flushed here;
    # confirm that the writer classes take care of this themselves.
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mySql/MySqlConnection.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mySql/MySqlConnection.py Tue Apr 30 15:02:29 2013 -0400 |
b |
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
#! /usr/bin/env python
import os
import random
import sqlite3
from SMART.Java.Python.mySql.MySqlQuery import MySqlQuery


class MySqlConnection(object):
    """
    Connection to a database
    Despite the class name, the backend is an SQLite file opened through the
    sqlite3 module. The file is created in the directory given by the
    SMARTTMPPATH environment variable (current directory if unset) under a
    randomly numbered name.
    @ivar verbosity:    verbosity
    @type verbosity:    int
    @ivar databaseName: path of the SQLite file
    @type databaseName: string
    @ivar connection:   the underlying sqlite3 connection
    """

    def __init__(self, verbosity = 0):
        """
        Open the database file and switch SQLite to its fastest, least safe settings
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity    = verbosity
        # defined before connecting so that __del__ stays safe if connect() raises
        self.connection   = None
        # NOTE(review): the name is drawn from only 100001 values, so two
        # processes sharing a directory could pick the same file -- confirm
        # whether this matters for the callers.
        self.databaseName = os.path.join(os.environ.get("SMARTTMPPATH", "."), "smartdb%d" % random.randint(0, 100000))
        self.connection   = sqlite3.connect(self.databaseName)
        self.executeQuery("PRAGMA journal_mode = OFF")
        self.executeQuery("PRAGMA synchronous = 0")
        self.executeQuery("PRAGMA locking_mode = EXCLUSIVE")
        # NOTE(review): SQLite documents this pragma as "count_changes";
        # unknown pragmas are ignored, so this line is probably a no-op -- confirm.
        self.executeQuery("PRAGMA count_change = OFF")
        self.executeQuery("PRAGMA temp_store = 2")

    def __del__(self):
        """
        Close the connection, if it was ever opened
        """
        connection = getattr(self, "connection", None)
        if connection is not None:
            connection.close()


    def createDatabase(self):
        """
        Do nothing (the file is created when the connection is opened)
        """
        pass


    def deleteDatabase(self):
        """
        Remove the database file from the disk, if it exists
        """
        if os.path.exists(self.databaseName):
            os.remove(self.databaseName)


    def executeQuery(self, command, insertion = False):
        """
        Execute one SQL command and commit
        The command is blindly tried a second time if the first attempt fails;
        a second failure is propagated to the caller.
        @param command:   the SQL command
        @type  command:   string
        @param insertion: whether the id of the inserted row should be returned
        @type  insertion: boolean
        @return: the value returned by MySqlQuery.execute if insertion is set, the MySqlQuery otherwise
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        try:
            result = query.execute(command, insertion)
            self.connection.commit()
        except Exception:
            # "except Exception" rather than a bare except, so that Ctrl-C
            # and sys.exit() are not answered by a retry
            result = query.execute(command, insertion)
            self.connection.commit()
        if insertion:
            return result
        return query


    def executeManyQueries(self, commands):
        """
        Execute several SQL commands, then commit once
        On failure the whole list is replayed from its beginning. Commands
        that had already succeeded are therefore executed twice (the journal
        is switched off in the constructor, so they cannot be rolled back).
        @param commands: the SQL commands
        @type  commands: list of string
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        try:
            for command in commands:
                query.execute(command)
            self.connection.commit()
        except Exception:
            for command in commands:
                query.execute(command)
            self.connection.commit()


    def executeManyFormattedQueries(self, command, lines, insertion = False):
        """
        Execute the same parameterized command once per set of parameters, then commit once
        @param command:   the SQL command, with placeholders
        @type  command:   string
        @param lines:     the successive sets of parameters
        @type  lines:     iterable of sequences
        @param insertion: whether the result of the last execution should be returned
        @type  insertion: boolean
        @return: the result of the last execution if insertion is set (None when lines is empty), the MySqlQuery otherwise
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        # pre-set so that an empty "lines" no longer raises UnboundLocalError
        result = None
        for line in lines:
            result = query.executeFormat(command, line)
        self.connection.commit()
        if insertion:
            return result
        return query


    def executeManyQueriesIterator(self, table):
        """
        Execute every SQL command yielded by table.getIterator(), then commit once
        On failure the iterator is requested again and replayed from its beginning.
        @param table: any object whose getIterator() yields SQL commands
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        try:
            for command in table.getIterator():
                query.execute(command)
            self.connection.commit()
        except Exception:
            for command in table.getIterator():
                query.execute(command)
            self.connection.commit()


    def executeManyFormattedQueriesIterator(self, table):
        """
        Execute every (command, values) pair yielded by table.getIterator(), then commit once
        On failure the iterator is requested again and replayed from its beginning.
        @param table: any object whose getIterator() yields (command, values) pairs
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        try:
            for command, values in table.getIterator():
                query.executeFormat(command, values)
            self.connection.commit()
        except Exception:
            for command, values in table.getIterator():
                # The retry must bind the values too: the former call to
                # query.execute(command, values) passed them as the
                # "insertion" flag and ran the command without its parameters.
                query.executeFormat(command, values)
            self.connection.commit()


    def executeFormattedQuery(self, command, parameters, insertion = False):
        """
        Execute one parameterized SQL command and commit
        @param command:    the SQL command, with placeholders
        @type  command:    string
        @param parameters: the values bound to the placeholders
        @type  parameters: sequence
        @param insertion:  whether the result of the execution should be returned
        @type  insertion:  boolean
        @return: the result of MySqlQuery.executeFormat if insertion is set, the MySqlQuery otherwise
        """
        cursor = self.connection.cursor()
        query  = MySqlQuery(cursor, self.verbosity)
        result = query.executeFormat(command, parameters)
        self.connection.commit()
        if insertion:
            return result
        return query
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mySql/MySqlExonTable.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mySql/MySqlExonTable.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import random
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.mySql.MySqlTable import MySqlTable


class MySqlExonTable(MySqlTable):
    """Table storing exons, each row carrying the id of the transcript it belongs to"""

    def __init__(self, connection, name = None, chromosome = None, verbosity = 0):
        """
        Build the table name as "<name>[_<chromosome>]_exons" and attach to the connection
        A random "TmpTable_<n>" name is used when no name is given.
        """
        suffix = "" if chromosome == None else "_%s" % chromosome
        if name == None:
            name = "TmpTable_%d" % (random.randint(0, 100000))
        super(MySqlExonTable, self).__init__(connection, "%s%s_exons" % (name, suffix), verbosity)


    def createExonTable(self):
        """
        Create the table with the columns of an Interval, plus an integer "transcriptId" column
        """
        columnNames = Interval.getSqlVariables()
        columnNames.append("transcriptId")
        columnTypes = Interval.getSqlTypes()
        columnTypes["transcriptId"] = "int"
        columnSizes = Interval.getSqlSizes()
        columnSizes["transcriptId"] = 11
        self.create(columnNames, columnTypes, columnSizes)


    def rename(self, name):
        """
        Rename the table to "<name>_exons"
        """
        newName = "%s_exons" % name
        super(MySqlExonTable, self).rename(newName)


    def addExon(self, exon, transcriptId):
        """
        Store an exon for the given transcript and record the new row id in exon.id
        """
        row = exon.getSqlValues()
        row["transcriptId"] = transcriptId
        exon.id = self.addLine(row)


    def _exonFromLine(self, line):
        """
        Turn one row of the table into an Interval
        """
        exon = Interval()
        exon.setSqlValues(line)
        return exon


    def retrieveExonsFromTranscriptId(self, transcriptId):
        """
        Return the list of exons of one transcript (empty if the table is not created)
        """
        if not self.created:
            return []
        command = "SELECT * FROM %s WHERE transcriptId = %d" % (self.name, transcriptId)
        rows    = self.mySqlConnection.executeQuery(command)
        found   = []
        for line in rows.getIterator():
            found.append(self._exonFromLine(line))
        return found


    def retrieveExonsFromBulkTranscriptIds(self, transcriptIds):
        """
        Return a dict mapping each requested transcript id to the list of its exons
        The dict is empty if no id is given or if the table is not created.
        """
        if not transcriptIds or not self.created:
            return {}
        exonsPerTranscript = dict((transcriptId, []) for transcriptId in transcriptIds)
        idList  = ", ".join("%s" % (transcriptId) for transcriptId in transcriptIds)
        command = "SELECT * FROM %s WHERE transcriptId IN (%s)" % (self.name, idList)
        rows    = self.mySqlConnection.executeQuery(command)
        for line in rows.getIterator():
            exon = self._exonFromLine(line)
            # the transcript id is read from the last column of the row
            exonsPerTranscript[line[-1]].append(exon)
        return exonsPerTranscript


    def removeFromTranscriptId(self, transcriptId):
        """
        Delete every exon of the given transcript
        """
        command = "DELETE FROM %s WHERE transcriptId = %d" % (self.name, transcriptId)
        self.mySqlConnection.executeQuery(command)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mySql/MySqlQuery.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mySql/MySqlQuery.py Tue Apr 30 15:02:29 2013 -0400 |
b |
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#

class MySqlQuery(object):
    """
    Query to a database
    Thin wrapper around a DB-API cursor: executes commands, optionally traces
    them, and gives access to the resulting rows.
    @ivar verbosity:  verbosity (commands are traced when it exceeds 99)
    @type verbosity:  int
    @ivar cursor:     the wrapped cursor
    @ivar insertedId: row id recorded by the last execute() called with insertion set
    @type insertedId: int or None
    """

    def __init__(self, cursor, verbosity = 0):
        """
        Constructor
        @param cursor:    the cursor used to run the commands
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity  = verbosity
        self.cursor     = cursor
        self.insertedId = None


    def __del__(self):
        """
        Close the wrapped cursor
        """
        self.cursor.close()


    def execute(self, query, insertion = False):
        """
        Execute a plain SQL command
        @param query:     the SQL command
        @type  query:     string
        @param insertion: whether the id of the inserted row should be returned
        @type  insertion: boolean
        @return: the last inserted row id if insertion is set, the result of cursor.execute otherwise
        @raise Exception: if the command fails (the message holds the command and the underlying error)
        """
        if self.verbosity > 99:
            print("Querying %s" % (query))
        try:
            results = self.cursor.execute(query)
        except Exception as error:
            # keep the root cause in the message: it used to be discarded
            raise Exception("Error! Command \"%s\" failed! (%s)" % (query, error))
        if insertion:
            # also remember the id, so that getInsertedId() is meaningful
            self.insertedId = self.cursor.lastrowid
            return self.insertedId
        return results


    def executeFormat(self, query, parameters):
        """
        Execute a parameterized SQL command
        @param query:      the SQL command, with placeholders
        @type  query:      string
        @param parameters: the values bound to the placeholders
        @type  parameters: sequence
        @return: the result of cursor.execute
        """
        if self.verbosity > 99:
            # one complete trace line: command, then the space-separated parameters
            print("Querying %s | %s" % (query, " ".join(["%s" % (parameter,) for parameter in parameters])))
        return self.cursor.execute(query, parameters)


    def getLine(self):
        """
        @return: the next row of the result, or None when exhausted
        """
        return self.cursor.fetchone()


    def getLines(self, lines = None):
        """
        @param lines: maximum number of rows to fetch (all remaining rows if None)
        @type  lines: int
        @return: the fetched rows
        """
        if lines is None:
            return self.cursor.fetchall()
        return self.cursor.fetchmany(lines)


    def isEmpty(self):
        """
        Consume the remaining rows, then test the cursor's row count
        @return: True if the row count is None or 0
        """
        # NOTE(review): the sqlite3 documentation states that rowcount is -1
        # for SELECT statements, in which case this never returns True.
        # Behaviour is left untouched because callers may rely on it -- confirm.
        self.getLines()
        return self.cursor.rowcount is None or self.cursor.rowcount == 0


    def getInsertedId(self):
        """
        @return: the row id recorded by the last execute() with insertion set, None if there was none
        """
        return self.insertedId


    def getIterator(self):
        """
        Iterate over the remaining rows of the result, one at a time
        """
        line = self.getLine()
        while line is not None:
            yield line
            line = self.getLine()


    def show(self):
        """
        Print the remaining rows of the result
        """
        for line in self.getIterator():
            print(line)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mySql/MySqlTable.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mySql/MySqlTable.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,349 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import re\n+import sys\n+\n+class MySqlTable(object):\n+ """\n+ Store a table of a mySQL database, used for transcripts or exons\n+ Record a a name and a type (int, float, double) for each column\n+ @ivar name: name of the table\n+ @type name: string\n+ @ivar variables: name of the columns\n+ @type variables: list of string\n+ @ivar types: type of the columns\n+ @type types: dict of string\n+ @ivar mySqlConnection: connection to a database\n+ @type mySqlConnection: class L{MySqlConnection<MySqlConnection>}\n+ @ivar nbLines: number of rows\n+ @type nbLines: int\n+ @ivar verbosity: verbosity\n+ @type verbosity: int\n+ """\n+\n+ def __init__(self, connection, name, verbosity = 0):\n+ """\n+ Constructor\n+ Possibly retrieve column names and types if table exists\n+ @param mySqlConnection: connection to a databas\n+ @type mySqlConnection: class L{MySqlConnection<MySqlConnection>}\n+ @param name: name of the table\n+ @type name: string\n+ @param verbosity: verbosity\n+ @type verbosity: int\n+ """\n+ self.name = name\n+ self.variables = []\n+ self.types = {}\n+ self.sizes = {}\n+ self.nbLines = None\n+ self.verbosity = verbosity\n+ self.mySqlConnection = connection\n+ queryTables = self.mySqlConnection.executeQuery("SELECT name FROM sqlite_master WHERE type LIKE \'table\' AND name LIKE \'%s\'" % (self.name))\n+ self.created = not queryTables.isEmpty()\n+ if self.created:\n+ queryFields = self.mySqlConnection.executeQuery("PRAGMA table_info(\'%s\')" % (name))\n+ for field in queryFields.getIterator():\n+ if field[1] != "id":\n+ 
self.variables.append(field[1])\n+ self.types[field[1]] = field[2]\n+ self.sizes[field[1]] = field[3]\n+ \n+ \n+ def getName(self):\n+ return self.name\n+\n+\n+ def create(self, variables, types, sizes):\n+ """\n+ Create a table using give column names and types\n+ @param variables: names of the columns\n+ @type variables: list of string\n+ @param types: types of the columns\n+ @type types: dict of string\n+ @param sizes: sizes of the types\n+ @type size'..b', id):\n+ """\n+ Retrieve a row from its id\n+ @param id: the id of the row\n+ @type id: int\n+ @return: the row\n+ """\n+ query = self.mySqlConnection.executeQuery("SELECT * FROM \'%s\' WHERE id = %d" % (self.name, id))\n+ result = query.getLine()\n+ if result == None:\n+ raise Exception("Error! Id %d is not in the table %s!" % (id, self.name))\n+ return result\n+\n+\n+ def retrieveBulkFromId(self, ids):\n+ """\n+ Retrieve a row from its id\n+ @param id: the ids of the row\n+ @type id: list of int\n+ @return: the row\n+ """\n+ if not ids:\n+ return []\n+ MAXSIZE = 1000\n+ results = []\n+ for batch in range(len(ids) / MAXSIZE + 1):\n+ theseIds = ids[batch * MAXSIZE : (batch+1) * MAXSIZE]\n+ if theseIds:\n+ query = self.mySqlConnection.executeQuery("SELECT * FROM \'%s\' WHERE id IN (%s)" % (self.name, ", ".join(["%d" % (id) for id in theseIds])))\n+ lines = query.getLines()\n+ if len(lines) != len(theseIds):\n+ raise Exception("Error! Some Ids of (%s) is are missing in the table \'%s\' (got %d instead of %d)!" 
% (", ".join(["%d" % (id) for id in theseIds]), self.name, len(lines)), len(theseIds))\n+ results.extend(lines)\n+ return results\n+\n+\n+ def removeFromId(self, id):\n+ """\n+ Remove a row from its id\n+ @param id: the id of the row\n+ @type id: int\n+ """\n+ self.mySqlConnection.executeQuery("DELETE FROM \'%s\' WHERE id = %d" % (self.name, id))\n+ \n+ \n+ def getIterator(self):\n+ """\n+ Iterate on the content of table\n+ @return: iterator to the rows of the table\n+ """\n+ if not self.created:\n+ return\n+ MAXSIZE = 1000\n+ query = self.mySqlConnection.executeQuery("SELECT count(id) FROM \'%s\'" % (self.name))\n+ nbRows = int(query.getLine()[0])\n+ for chunk in range((nbRows / MAXSIZE) + 1):\n+ query = self.mySqlConnection.executeQuery("SELECT * FROM \'%s\' LIMIT %d, %d" % (self.name, chunk * MAXSIZE, MAXSIZE))\n+ for line in query.getIterator():\n+ yield line\n+\n+\n+ def createIndex(self, indexName, values, unique = False, fullText = False):\n+ """\n+ Add an index on the table\n+ @param indexName: name of the index\n+ @type indexName: string\n+ @param values: values to be indexed\n+ @type values: string\n+ @param unique: if the index is unique\n+ @type unique: boolean\n+ @param fullText: whether full text should be indexed\n+ @type fullText: boolean\n+ """\n+ self.mySqlConnection.executeQuery("CREATE %s%sINDEX \'%s\' ON \'%s\' (%s)" % ("UNIQUE " if unique else "", "FULLTEXT " if fullText else "", indexName, self.name, ", ".join(values)))\n+\n+\n+ def setDefaultTagValue(self, field, name, value):\n+ """\n+ Add a tag value\n+ @param name: name of the tag\n+ @type name: string\n+ @param value: value of the tag\n+ @type value: string or int\n+ """\n+ newData = {}\n+ for line in MySqlTable.getIterator(self):\n+ id = line[0]\n+ tags = line[field]\n+ if tags == \'\':\n+ newTag = "%s=%s" % (name, value)\n+ else:\n+ newTag = "%s;%s=%s" % (tags, name, value)\n+ if name not in [tag.split("=")[0] for tag in tags.split(";")]:\n+ newData[id] = newTag\n+ for id, tag in 
newData.iteritems():\n+ query = self.mySqlConnection.executeQuery("UPDATE \'%s\' SET tags = \'%s\' WHERE id = %i" % (self.name, tag, id))\n+\n+\n+\n+ def show(self):\n+ """\n+ Drop the content of the current table\n+ """\n+ query = self.mySqlConnection.executeQuery("SELECT * FROM \'%s\'" % (self.name))\n+ print query.getLines()\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/mySql/MySqlTranscriptTable.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mySql/MySqlTranscriptTable.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import random
import sys
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from SMART.Java.Python.mySql.MySqlExonTable import MySqlExonTable
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress

class MySqlTranscriptTable(MySqlTable):
    """Table storing transcripts, one row per transcript"""

    def __init__(self, connection, name = None, chromosome = None, verbosity = 0):
        """
        Build the table name as "<name>[_<chromosome>]_transcripts" and attach to the connection
        A random "TmpTable_<n>" name is used when no name is given.
        """
        suffix = "" if chromosome == None else "_%s" % chromosome
        if name == None:
            name = "TmpTable_%d" % (random.randint(0, 100000))
        baseName = "%s%s" % (name, suffix)
        super(MySqlTranscriptTable, self).__init__(connection, "%s_transcripts" % baseName, verbosity)


    def createTranscriptTable(self):
        """
        Create the table with the columns, types and sizes declared by Transcript
        """
        columnNames = Transcript.getSqlVariables()
        columnTypes = Transcript.getSqlTypes()
        columnSizes = Transcript.getSqlSizes()
        self.create(columnNames, columnTypes, columnSizes)


    def rename(self, name):
        """
        Rename the table to "<name>_transcripts"
        """
        newName = "%s_transcripts" % name
        super(MySqlTranscriptTable, self).rename(newName)


    def remove(self):
        """
        Delegate to MySqlTable.remove
        """
        super(MySqlTranscriptTable, self).remove()


    def clear(self):
        """
        Delegate to MySqlTable.clear
        """
        super(MySqlTranscriptTable, self).clear()


    def copy(self, transcriptTable):
        """
        Remove this table, then copy the given table through MySqlTable.copy
        """
        self.remove()
        super(MySqlTranscriptTable, self).copy(transcriptTable)


    def add(self, transcriptTable):
        """
        Delegate to MySqlTable.add
        """
        super(MySqlTranscriptTable, self).add(transcriptTable)


    def addTranscript(self, transcript):
        """
        Store a transcript and record the new row id in transcript.id
        """
        transcript.id = self.addLine(transcript.getSqlValues())


    def addTranscriptList(self, transcriptList):
        """
        Store every transcript of a list, reporting progress
        """
        tracker = Progress(transcriptList.getNbTranscript(), "Storing list to %s" % (self.name), self.verbosity)
        for item in transcriptList.getIterator():
            self.addTranscript(item)
            tracker.inc()
        tracker.done()


    def removeTranscript(self, transcript):
        """
        Delete the row whose id is transcript.id
        """
        self.removeFromId(transcript.id)


    def retrieveTranscriptFromId(self, id):
        """
        Return the transcript stored under the given row id
        """
        row    = self.retrieveFromId(id)
        result = Transcript()
        result.setSqlValues(row)
        return result


    def retrieveBulkTranscriptFromId(self, ids):
        """
        Return the transcripts stored under the given row ids
        Rows are keyed by their first column, so each id yields one transcript.
        """
        if not ids:
            return []
        rows      = self.retrieveBulkFromId(ids)
        perRowId  = {}
        for row in rows:
            item = Transcript()
            item.setSqlValues(row)
            perRowId[row[0]] = item
        return perRowId.values()


    def selectTranscripts(self, command, simple = False):
        """
        Run a SELECT command and yield (id, transcript) pairs
        Unless simple is set, the command is run page by page with
        "LIMIT ... OFFSET ..." until a page comes back empty.
        """
        pageSize  = 100000
        pageIndex = 0
        while True:
            if simple:
                pagedCommand = command
            else:
                pagedCommand = "%s LIMIT %d OFFSET %d" % (command, pageSize, pageSize * pageIndex)
            rows     = self.mySqlConnection.executeQuery(pagedCommand)
            gotLines = False
            for row in rows.getIterator():
                gotLines = True
                rowId = int(row[0])
                item  = Transcript()
                item.setSqlValues(row)
                yield (rowId, item)
            # a simple command is run once; otherwise stop after the first empty page
            if simple or not gotLines:
                return
            pageIndex += 1


    def getIterator(self):
        """
        Yield every transcript of the table
        """
        selection = "SELECT * FROM '%s'" % (self.name)
        for rowId, item in self.selectTranscripts(selection):
            yield item


    def retrieveTranscriptList(self):
        """
        Return the content of the table as a TranscriptList
        """
        collected = TranscriptList()
        for row in self.getLines():
            item = Transcript()
            item.setSqlValues(row)
            collected.addTranscript(item)
        return collected


    def setDefaultTagValue(self, name, value):
        """
        Delegate to MySqlTable.setDefaultTagValue, pointing it at the "tags" column
        """
        # +1 because the row starts with the id column, which is not a Transcript variable
        tagsColumn = Transcript.getSqlVariables().index("tags") + 1
        super(MySqlTranscriptTable, self).setDefaultTagValue(tagsColumn, name, value)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/.NCList.py.swp |
b |
Binary file SMART/Java/Python/ncList/.NCList.py.swp has changed |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/.NCListCursor.py.swp |
b |
Binary file SMART/Java/Python/ncList/.NCListCursor.py.swp has changed |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/Benchmark.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/Benchmark.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,357 @@\n+import os, os.path, random, glob, subprocess, threading, time, resource\n+from optparse import OptionParser\n+from SMART.Java.Python.misc.Progress import *\n+from SMART.Java.Python.getRandomRegions import RandomRegionsGenerator\n+from commons.core.writer.TranscriptWriter import TranscriptWriter\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from commons.core.parsing.GffParser import GffParser\n+\n+#TYPES = ("bin", "has", "seg", "fj", "nc", "new")\n+TYPES = ("new", )\n+\n+class RunCmd(threading.Thread):\n+\tdef __init__(self, cmd, out, err, time, memory):\n+\t\tthreading.Thread.__init__(self)\n+\t\tself._cmd = cmd\n+\t\tself._out = out\n+\t\tself._err = err\n+\t\tself._time = time\n+\t\tself._memory = memory\n+\t\tself._id\t = os.getpid()\n+\t\tself._mem = 0.0\n+\t\tself._outputFileName = "tmp_%d.out" % (self._id)\n+\n+\tdef run(self):\n+\t\tself._p = subprocess.Popen(self._cmd, stdout = self._out, stderr = self._err, shell = True)\n+\t\t#self._p.wait()\n+\n+\tdef _runShellCommand(self, command):\n+\t\tp = subprocess.call("%s > %s" % (command, self._outputFileName), shell=True)\n+\t\thandle = open(self._outputFileName)\n+\t\tdata = [line.split() for line in handle.readlines()[1:] if line]\n+\t\thandle.close()\n+\t\tos.remove(self._outputFileName)\n+\t\treturn data\n+\n+\tdef _getPid(self):\n+\t\tself._pid\t = None\n+\t\tcpt = 1\n+\t\twhile True:\n+\t\t\tcommandsFound = []\n+\t\t\tfor line in self._runShellCommand("ps -o pid,cmd"):\n+\t\t\t\tif line[1:] == self._cmd.split(" "):\n+\t\t\t\t\tself._pid = int(line[0])\n+\t\t\t\tcommandsFound.append(" ".join(line[1:]))\n+\t\t\tif self._pid != None:\n+\t\t\t\treturn True\n+\t\t\ttime.sleep(1)\n+\t\t\tif cpt % 100 == 0:\n+\t\t\t\tprint "pid of \'%s\' not found after %d seconds. 
Found: %s" % (self._cmd, cpt, " --- ".join(commandsFound))\n+\t\t\tcpt += 1\n+\t\t\tif cpt > 300:\n+\t\t\t\treturn False\n+\n+\tdef _fetchMemory(self):\n+\t\tlines = self._runShellCommand("ps u -p %d" % (self._pid))\n+\t\tfor line in lines:\n+\t\t\tself._mem = max(self._mem, float(line[3]))\n+\t\t\treturn self._mem >= self._memory\n+\t\t#print "Cannot find the memory of the current PID (%d) in: %s" % (self._pid, " --- ".join([" ".join(line) for line in lines]))\n+\t\treturn False\n+\t\n+\tdef getMemory(self):\n+\t\treturn self._mem\n+\n+\tdef _abort(self):\n+\t\ttry:\n+\t\t\tself._p.terminate()\n+\t\texcept Exception:\n+\t\t\tpass\n+\t\tself._killSubThreads()\n+\t\n+\tdef _killSubThreads(self):\n+\t\tfor line in self._runShellCommand("ps --ppid %d -o pid" % (self._pid)):\n+\t\t\tself._runShellCommand("kill %s" % (line[0]))\n+\t\tself._runShellCommand("kill %s" % (self._pid))\n+\n+\tdef go(self):\n+\t\tstartTime = time.time()\n+\t\tself.run()\n+\t\t#self.start()\n+\t\twhile not self._getPid():\n+\t\t\t#self.start()\n+\t\t\tself.run()\n+\t\twhile True:\n+\t\t\tif self._time != None and time.time() - startTime > self._time:\n+\t\t\t\tprint "\\nCommand \'%s\' did not finish in time. Aborting it." % (self._cmd)\n+\t\t\t\tself._abort()\n+\t\t\t\tbreak\n+\t\t\tif self._memory != None and self._fetchMemory():\n+\t\t\t\tprint "\\nCommand \'%s\' required too much memory (%f). Aborting it." 
% (self._cmd, self._mem)\n+\t\t\t\tself._abort()\n+\t\t\t\tbreak\n+\t\t\t#self.join(0.1)\n+\t\t\ttime.sleep(0.1)\n+\t\t\t#if not self.isAlive():\n+\t\t\tif self._p.poll() != None:\n+\t\t\t\treturn True\n+\t\treturn False\n+\n+\n+class DataStructure(object):\n+\tdef __init__(self):\n+\t\tself._structure = {}\n+\n+\tdef addData(self, data):\n+\t\tif data._nbRefs not in self._structure:\n+\t\t\tself._structure[data._nbRefs] = {}\n+\t\tif data._nbQueries not in self._structure[data._nbRefs]:\n+\t\t\tself._structure[data._nbRefs][data._nbQueries] = {}\n+\t\tif data._genomeSize not in self._structure[data._nbRefs][data._nbQueries]:\n+\t\t\tself._structure[data._nbRefs][data._nbQueries][data._genomeSize] = {}\n+\t\tif data._type not in self._structure[data._nbRefs][data._nbQueries][data._genomeSize]:\n+\t\t\tself._structure[data._nbRefs][data._nbQueries][data._genomeSize][data._type] = []\n+\t\tself._structure[data._nbRefs][data._nbQueries][data._genomeSize][data._type].append(data._group)\n+\n+\tdef export(self):\n+\t\toutputString = "#refs\\t#queries\\tgenome size\\ttype\\t# written\\t# overlaps\\tbuild t.\\trun t.\\tmem\\n"\n+\t\tf'..b' nbReferences, nbQueries, "NA", "NA", genomeSize)\n+\t\t\t\t\t\t\telse:\n+\t\t\t\t\t\t\t\tdata[type] = self._parseTrace(type, fileName, genomeSize)\n+\t\t\t\t\t\t\t\tself._structure.addData(data[type])\n+\t\t\t\t\t\t\t\tos.remove(fileName)\n+\t\t\t\t\t\t\tself._cleanTmpFiles()\n+\t\t\t\t\t\tself._cleanTmpFiles(True)\n+\t\t\t\t\t\tfirstType = TYPES[0]\n+\t\t\t\t\t\tfor type in TYPES[1:]:\n+\t\t\t\t\t\t\tif not data[firstType].checkConsistency(data[type]):\n+\t\t\t\t\t\t\t\traise Exception("Outputs are not consistent.\\n # outputs: %d vs %d.\\n # overlaps: %d vs %d.\\n %s: %f + %f; %s: %f + %f.\\n Files are %s and %s." 
% (data[firstType]._group._nbOutputs, data[type]._group._nbOutputs, data[firstType]._group._nbOverlaps, data[type]._group._nbOverlaps, firstType, data[firstType]._group._buildTime, data[firstType]._group._runTime, data[firstType]._group._mem, type, data[type]._group._buildTime, data[type]._group._runTime, data[type]._group._mem, refFileName, queryFileName))\n+\t\t\t\t\t\tfor fileName in (queryFileName, refFileName):\n+\t\t\t\t\t\t\tif os.path.exists(fileName):\n+\t\t\t\t\t\t\t\tos.remove(fileName)\n+\t\t\t\t\t\tprogress.inc()\n+\t\tprogress.done()\n+\t\thandle = open(self._outputFileName, "w")\n+\t\thandle.write(self._structure.export())\n+\t\thandle.close()\n+\n+\n+\n+if __name__ == "__main__":\n+\t\n+\tdescription = "Benchmark v1.0.2: Compare NC-List with other tools. Only work under Linux. [Category: Other]"\n+\tparser = OptionParser(description = description)\n+\tparser.add_option("-r", "--nbReferences", dest="nbReferences", action="store", default=None, type="string", help="number of references (list of integers separated by commas) [compulsory] [format: string]")\n+\tparser.add_option("-q", "--nbQueries", dest="nbQueries",\t\t action="store", default=None, type="string", help="number of queries as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")\n+\tparser.add_option("-R", "--nbReplicates", dest="nbReplicates", action="store", default=None, type="int",\thelp="number of replicates [compulsory] [format: int]")\n+\tparser.add_option("-s", "--genomeSizes", dest="genomeSizes", action="store", default=None, type="string", help="genome size as a factor of the number of references (list of floats separated by commas) [compulsory] [format: string]")\n+\tparser.add_option("-c", "--chromosome", dest="chromosome",\t action="store", default="chr1", type="string", help="name of the chromosome [default: chr1] [format: string]")\n+\tparser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, 
type="int",\thelp="minimum size of the reads [compulsory] [format: int]")\n+\tparser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int",\thelp="maximum size of the reads [compulsory] [format: int]")\n+\tparser.add_option("-o", "--output", dest="outputFileName", action="store",\t\t\t\t type="string", help="output file [compulsory] [format: output file in TXT format]")\n+\tparser.add_option("-t", "--time", dest="time", action="store", default=None, type="int",\thelp="maximum time to wait (in seconds) [default: None] [format: int]")\n+\tparser.add_option("-m", "--memory",\t dest="memory",\t\t action="store", default=None, type="float",\thelp="maximum memory usage (in %) [default: None] [format: float]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1,\t type="int",\thelp="trace level [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\tbenchmark = Benchmark(options.verbosity)\n+\tbenchmark.setNbReferences(map(int, options.nbReferences.split(",")))\n+\tbenchmark.setNbQueries(map(float, options.nbQueries.split(",")))\n+\tbenchmark.setGenomeSizes(map(float, options.genomeSizes.split(",")))\n+\tbenchmark.setNbReplicates(options.nbReplicates)\n+\tbenchmark.setChromosomeName(options.chromosome)\n+\tbenchmark.setSizes(options.minSize, options.maxSize)\n+\tbenchmark.setLimits(options.time, options.memory)\n+\tbenchmark.setOutputFileName(options.outputFileName)\n+\tbenchmark.run()\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/ConvertToNCList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/ConvertToNCList.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,172 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import random, os, time, shutil
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
from SMART.Java.Python.ncList.FileSorter import FileSorter
from SMART.Java.Python.ncList.NCListMerger import NCListMerger
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
try:
    import cPickle as pickle
except ImportError:
    # cPickle only exists on Python 2; fall back to the pure-Python module.
    import pickle


class ConvertToNCList(object):
    """Convert a transcript or mapping file into a single NC-list file.

    The conversion runs in three steps (see run()):
      1. sortFile()        -- split the input per chromosome into pickled,
                              sorted temporary files (delegated to FileSorter);
      2. createNCLists()   -- build one NCList per chromosome;
      3. writeOutputFile() -- merge all NCLists into the output file
                              (delegated to NCListMerger).
    Temporary files live in a directory derived from the output file name
    and are deleted by cleanFiles().
    """

    def __init__(self, verbosity = 1):
        self._parsers = {}
        self._sortedFileNames = {}
        self._inputFileName = None
        self._outputFileName = None
        self._index = False
        self._ncLists = {}
        self._splittedFileNames = {}
        self._nbElements = 0
        self._nbElementsPerChromosome = {}
        # Random suffix used to make the temporary directory name unlikely to clash.
        self._randomNumber = random.randint(0, 10000)
        self._sorted = False
        self._verbosity = verbosity

    def setInputFileName(self, fileName, format):
        """Select the input file and build the parser matching `format`."""
        self._inputFileName = fileName
        chooser = ParserChooser(self._verbosity)
        chooser.findFormat(format)
        self._parser = chooser.getParser(fileName)

    def setOutputFileName(self, fileName):
        """Set the output file and create the temporary working directory.

        The directory is named "<output without extension>_<random>_files"
        and sits next to the output file.
        """
        self._outputFileName = fileName
        fileNameNoExtension = os.path.splitext(fileName)[0]
        baseName = "%s_%d" % (fileNameNoExtension, self._randomNumber)
        self._directory = "%s_files" % (baseName)
        if not os.path.exists(self._directory):
            os.makedirs(self._directory)
        # baseName still carries the directory part of the output file name.
        # Joining it verbatim would either point to a sub-directory that does
        # not exist (relative path) or silently escape the temporary directory
        # (absolute path: os.path.join drops every previous component).
        # Only the last path component may be used as the file prefix.
        self._sortedFileNames = os.path.join(self._directory, os.path.basename(baseName))

    def setIndex(self, boolean):
        """Tell whether an index should be created with the NC-lists."""
        self._index = boolean

    def setSorted(self, boolean):
        """Tell whether the input file is already sorted."""
        self._sorted = boolean

    def sortFile(self):
        """Sort (or simply rewrite, if pre-sorted) the input, one file per chromosome."""
        if self._verbosity > 2:
            print("%s file %s..." % ("Rewriting" if self._sorted else "Sorting", self._inputFileName))
        startTime = time.time()
        fs = FileSorter(self._parser, self._verbosity-4)
        fs.setPresorted(self._sorted)
        # perChromosome() must be called before setOutputFileName(): the
        # sorter strips the extension of the prefix only in that mode.
        fs.perChromosome(True)
        fs.setOutputFileName(self._sortedFileNames)
        fs.sort()
        self._splittedFileNames = fs.getOutputFileNames()
        self._nbElementsPerChromosome = fs.getNbElementsPerChromosome()
        self._nbElements = fs.getNbElements()
        endTime = time.time()
        if self._verbosity > 2:
            print(" ...done (%ds)" % (endTime - startTime))

    def createNCLists(self):
        """Build one NCList per chromosome from the files written by sortFile()."""
        self._ncLists = {}
        if self._verbosity > 2:
            print("Creating NC-list for %s..." % (self._inputFileName))
        startTime = time.time()
        # items() instead of iteritems(): same result, works on Python 2 and 3.
        for chromosome, fileName in self._splittedFileNames.items():
            if self._verbosity > 3:
                print(" chromosome %s" % (chromosome))
            ncList = NCList(self._verbosity)
            if self._index:
                ncList.createIndex(True)
            ncList.setChromosome(chromosome)
            ncList.setFileName(fileName)
            ncList.setNbElements(self._nbElementsPerChromosome[chromosome])
            ncList.buildLists()
            self._ncLists[chromosome] = ncList
        endTime = time.time()
        if self._verbosity > 2:
            print(" ...done (%ds)" % (endTime - startTime))

    def writeOutputFile(self):
        """Merge the per-chromosome NC-lists into the output file."""
        merger = NCListMerger(self._verbosity)
        merger.setFileName(self._outputFileName)
        merger.addIndex(self._index)
        merger.setNCLists(self._ncLists)
        merger.merge()

    def cleanFiles(self):
        """Delete the temporary working directory and everything in it."""
        shutil.rmtree(self._directory)

    def run(self):
        """Run the whole conversion; the temporary directory is removed even on failure."""
        try:
            self.sortFile()
            self.createNCLists()
            self.writeOutputFile()
        finally:
            self.cleanFiles()

    def getSortedFileNames(self):
        """Return the {chromosome: file name} mapping produced by sortFile()."""
        return self._splittedFileNames

    def getNbElements(self):
        """Return the total number of elements read by sortFile()."""
        return self._nbElements

    def getNbElementsPerChromosome(self):
        """Return the {chromosome: number of elements} mapping produced by sortFile()."""
        return self._nbElementsPerChromosome

    def getNCLists(self):
        """Return the {chromosome: NCList} mapping produced by createNCLists()."""
        return self._ncLists

    def getTmpDirectory(self):
        """Return the path of the temporary working directory."""
        return self._directory


if __name__ == "__main__":
    description = "Convert To NC-List v1.0.0: Convert a mapping or transcript file into a NC-List. [Category: NC-List]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="Query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-d", "--index", dest="index", action="store_true", default=False, help="create an index [default: false] [format: boolean]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in NCList format]")
    parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input file is already sorted [format: boolean] [default: False]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    # Fail early with a usage message instead of an obscure traceback on None.
    for optionName, flag in (("inputFileName", "-i"), ("format", "-f"), ("outputFileName", "-o")):
        if getattr(options, optionName) is None:
            parser.error("option %s is compulsory" % (flag))

    ctncl = ConvertToNCList(options.verbosity)
    ctncl.setInputFileName(options.inputFileName, options.format)
    ctncl.setOutputFileName(options.outputFileName)
    ctncl.setIndex(options.index)
    ctncl.setSorted(options.sorted)
    ctncl.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/FileSorter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/FileSorter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,210 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

try:
    import cPickle as pickle
except ImportError:
    # cPickle only exists on Python 2; fall back to the pure-Python module.
    import pickle
import random, os
from heapq import heapify, heappop, heappush
from itertools import islice, cycle
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

# Number of transcripts kept in memory per chromosome before a sorted chunk
# is flushed to a temporary file.
BUFFER_SIZE = 100 * 1024


class FileSorter(object):
    """Sort the transcripts of a parser by (start, -end) and pickle them to disk.

    Two strategies are available (see sort()):
      * pre-sorted input: transcripts are simply re-written as pickles;
      * unsorted input: an external merge sort is used, with sorted chunks of
        at most BUFFER_SIZE transcripts written to temporary files and then
        merged with a heap.
    Output is either one file, or one file per chromosome.
    """

    def __init__(self, parser, verbosity = 1):
        self._parser = parser
        self._verbosity = verbosity
        # {chromosome: [closed file objects of the temporary sorted chunks]}
        self._chunks = {}
        self._nbElements = 0
        self._nbElementsPerChromosome = {}
        self._perChromosome = False
        self._isPreSorted = False
        self._outputFileNames = {}
        self._prefix = "tmpFile_%d" % (random.randint(0, 100000))
        self._chromosome = None
        if "SMARTTMPPATH" in os.environ:
            self._prefix = os.path.join(os.environ["SMARTTMPPATH"], self._prefix)

    def selectChromosome(self, chromosome):
        """Keep only the transcripts located on `chromosome` (None keeps everything)."""
        self._chromosome = chromosome

    def perChromosome(self, boolean):
        """Tell whether one output file per chromosome should be written."""
        self._perChromosome = boolean

    def setOutputFileName(self, fileName):
        """Set the output file name (or prefix, in per-chromosome mode).

        In per-chromosome mode the extension is stripped, so perChromosome()
        has to be called before this method.
        """
        self._outputFileName = fileName
        if self._perChromosome:
            self._outputFileName = os.path.splitext(self._outputFileName)[0]

    def setPresorted(self, presorted):
        """Tell whether the input is already sorted."""
        self._isPreSorted = presorted

    def sort(self):
        """Write the sorted output file(s) using the configured strategy."""
        if not self._isPreSorted:
            self._batchSort()
        else:
            self._presorted()

    def _presorted(self):
        """Re-write an already sorted input as pickles, without sorting.

        In per-chromosome mode a new file is opened (and truncated) each time
        the chromosome changes, so the transcripts of a chromosome must be
        contiguous in the input.
        """
        progress = UnlimitedProgress(1000, "Writing files %s" % (self._parser.fileName), self._verbosity)
        curChromosome = None
        outputHandle = None

        try:
            if not self._perChromosome:
                outputHandle = open(self._outputFileName, "wb")
            for transcript in self._parser.getIterator():
                progress.inc()
                if transcript.__class__.__name__ == "Mapping":
                    transcript = transcript.getTranscript()
                chromosome = transcript.getChromosome()
                if self._chromosome is not None and chromosome != self._chromosome:
                    continue
                self._nbElements += 1
                self._nbElementsPerChromosome[chromosome] = self._nbElementsPerChromosome.get(chromosome, 0) + 1
                if self._perChromosome:
                    if chromosome != curChromosome:
                        if outputHandle is not None:
                            outputHandle.close()
                        self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome)
                        outputHandle = open(self._outputFileNames[chromosome], "wb")
                        curChromosome = chromosome
                # Same bytes as before (default pickle protocol), but written in
                # one call instead of character by character through writelines().
                outputHandle.write(pickle.dumps(transcript))
        finally:
            # Do not leak the output handle if the parser or the pickling fails.
            if outputHandle is not None:
                outputHandle.close()
        progress.done()

    def getNbElements(self):
        """Return the number of transcripts kept by the last sort."""
        return self._nbElements

    def getNbElementsPerChromosome(self):
        """Return the {chromosome: number of transcripts} mapping of the last sort."""
        return self._nbElementsPerChromosome

    def _printSorted(self, chromosome, chunk):
        """Sort `chunk` in place and dump it to a new temporary file of `chromosome`."""
        chunk.sort(key = lambda transcript: (transcript.getStart(), -transcript.getEnd()))
        outputChunk = open("%s_%s_%06i.tmp" % (self._prefix, chromosome, len(self._chunks[chromosome])), "wb", 32000)
        # Register the file before writing, so that it is cleaned up even if
        # the dump fails half-way.
        self._chunks[chromosome].append(outputChunk)
        try:
            for transcript in chunk:
                outputChunk.write(pickle.dumps(transcript, -1))
        finally:
            outputChunk.close()

    @staticmethod
    def _pushNext(heap, order, handle):
        """Read the next transcript of `handle` and push it on `heap`.

        The handle is closed when the end of the file is reached.
        """
        try:
            transcript = pickle.load(handle)
        except EOFError:
            handle.close()
            return
        # `order` (the rank of the chunk) breaks ties between transcripts with
        # the same coordinates: the transcript objects themselves are never
        # compared, and equal elements keep their input order.
        heappush(heap, (transcript.getStart(), -transcript.getEnd(), order, transcript, handle))

    def _merge(self, chunks):
        """Yield the transcripts of the sorted `chunks` in (start, -end) order.

        `chunks` is a list of (closed) file objects; each file is re-opened by
        name for reading. The temporary files are not deleted here: this is
        done by _batchSort() once the merge is over.
        """
        heap = []
        handles = []
        try:
            for order, chunk in enumerate(chunks):
                handle = open(chunk.name, "rb")
                handles.append(handle)
                self._pushNext(heap, order, handle)
            while heap:
                start, end, order, transcript, handle = heappop(heap)
                yield transcript
                self._pushNext(heap, order, handle)
        finally:
            # Also reached when the generator is abandoned before exhaustion.
            for handle in handles:
                handle.close()

    def _batchSort(self):
        """External merge sort: write sorted chunks, then merge them per chromosome."""
        # Forget the chunks of a previous run: their files have been deleted.
        self._chunks = {}
        currentChunks = {}
        counts = {}
        outputHandle = None
        try:
            progress = UnlimitedProgress(1000, "Sorting file %s" % (self._parser.fileName), self._verbosity)
            for transcript in self._parser.getIterator():
                progress.inc()
                if transcript.__class__.__name__ == "Mapping":
                    transcript = transcript.getTranscript()
                chromosome = transcript.getChromosome()
                if self._chromosome is not None and chromosome != self._chromosome:
                    continue
                if chromosome not in self._chunks:
                    self._chunks[chromosome] = []
                    currentChunks[chromosome] = []
                    counts[chromosome] = 0
                currentChunks[chromosome].append(transcript)
                counts[chromosome] += 1
                if counts[chromosome] == BUFFER_SIZE:
                    self._printSorted(chromosome, currentChunks[chromosome])
                    currentChunks[chromosome] = []
                    counts[chromosome] = 0
                self._nbElements += 1
                self._nbElementsPerChromosome[chromosome] = self._nbElementsPerChromosome.get(chromosome, 0) + 1
            # Flush the partially filled buffers.
            for chromosome in self._chunks:
                if counts[chromosome] > 0:
                    self._printSorted(chromosome, currentChunks[chromosome])
            progress.done()
            if not self._perChromosome:
                outputHandle = open(self._outputFileName, "wb")
            progress = Progress(len(self._chunks), "Writing sorted file %s" % (self._parser.fileName), self._verbosity)
            for chromosome in self._chunks:
                if self._perChromosome:
                    self._outputFileNames[chromosome] = "%s_%s.pkl" % (self._outputFileName, chromosome)
                    outputHandle = open(self._outputFileNames[chromosome], "wb")
                for sequence in self._merge(self._chunks[chromosome]):
                    pickle.dump(sequence, outputHandle, -1)
                if self._perChromosome:
                    outputHandle.close()
                progress.inc()
            if not self._perChromosome:
                outputHandle.close()
            progress.done()
        finally:
            # Closing an already closed file is a no-op.
            if outputHandle is not None:
                outputHandle.close()
            # Best-effort removal of the temporary chunk files.
            for chunks in self._chunks.values():
                for chunk in chunks:
                    try:
                        chunk.close()
                        os.remove(chunk.name)
                    except (IOError, OSError):
                        pass

    def getOutputFileNames(self):
        """Return the {chromosome: file name} mapping (filled in per-chromosome mode only)."""
        return self._outputFileNames
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/FindOverlapsWithOneInterval.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,197 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import struct
import math
import os
import time
from optparse import OptionParser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.FileSorter import FileSorter
from commons.core.parsing.ParserChooser import ParserChooser
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.structure.Transcript import Transcript

LONGSIZE = struct.calcsize('l')


class FindOverlapsWithOneInterval(object):
    """Find the elements of a file that overlap one query interval, using an NC-list.

    Typical use (see run()): the input file is sorted and restricted to the
    chromosome of the query, an NC-list is built from it, the list is
    searched, and the query is written to a GFF3 file with the tags
    "nbOverlaps" and "overlapsWith".
    """

    def __init__(self, verbosity):
        self._sortedFileName = None
        self._verbosity = verbosity
        self._overlappingNames = []
        self._nbOverlaps = 0
        self._nbWritten = 0
        # Set by setOutputFileName() and setInterval()/setTranscript().
        self._iWriter = None
        self._transcript = None

    def __del__(self):
        # Remove the intermediate sorted file, if any was declared.
        if self._sortedFileName and os.path.exists(self._sortedFileName):
            os.remove(self._sortedFileName)

    def close(self):
        """Close the output writer, if one has been set."""
        if self._iWriter is not None:
            self._iWriter.close()

    def setOutputFileName(self, fileName):
        """Open a GFF3 writer on `fileName`."""
        self._iWriter = Gff3Writer(fileName)

    def setFileName(self, fileName, format):
        """Select the input file, its parser, and the name of the intermediate sorted file."""
        chooser = ParserChooser(self._verbosity)
        chooser.findFormat(format)
        self._parser = chooser.getParser(fileName)
        self._sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])

    def setInterval(self, chromosome, start, end):
        """Define the query from raw coordinates (a "+" strand transcript is created)."""
        self._chromosome = chromosome
        self._start = start
        self._end = end
        self._transcript = Transcript()
        self._transcript.setChromosome(chromosome)
        self._transcript.setStart(start)
        self._transcript.setEnd(end)
        self._transcript.setDirection("+")

    def setTranscript(self, transcript):
        """Define the query from a transcript (or from a mapping, which is converted)."""
        if transcript.__class__.__name__ == "Mapping":
            transcript = transcript.getTranscript()
        self._chromosome = transcript.getChromosome()
        self._start = transcript.getStart()
        self._end = transcript.getEnd()
        self._transcript = transcript

    def prepareIntermediateFiles(self):
        """Sort the elements of the query chromosome into the intermediate file."""
        fs = FileSorter(self._parser, self._verbosity-4)
        fs.selectChromosome(self._chromosome)
        fs.perChromosome(False)
        fs.setOutputFileName(self._sortedFileName)
        fs.sort()
        self._nbTotalLines = fs.getNbElements()
        # The chromosome is absent from the mapping when the input holds no
        # element on it: report 0 instead of raising KeyError.
        self._nbLines = fs.getNbElementsPerChromosome().get(self._chromosome, 0)

    def createNCList(self):
        """Build the (indexed) NC-list from the intermediate sorted file."""
        if self._verbosity > 2:
            print("Creating NC-list...")
        # The timing variables were previously read without ever being set.
        startTime = time.time()
        ncList = NCList(self._verbosity)
        ncList.createIndex(True)
        ncList.setChromosome(self._chromosome)
        ncList.setFileName(self._sortedFileName)
        ncList.setNbElements(self._nbTotalLines)
        ncList.buildLists()
        self.setNCList(ncList, ncList.getIndex())
        endTime = time.time()
        if self._verbosity > 2:
            print(" ...done (%ds)" % (endTime - startTime))

    def setNCList(self, ncList, index):
        """Provide an already built NC-list and its index."""
        self._ncList = ncList
        self._index = index
        # Historical misspelling, kept in case external code reads it.
        self._indix = index

    def binarySearch(self, cursor, startL, endL):
        """Move `cursor` to the left-most overlapping sibling in [startL, endL].

        Returns the cursor, or None when no sibling overlaps the query.
        NOTE(review): this presumably relies on the siblings being ordered so
        that isOverlapping() goes -1, then 0, then 1 -- confirm against NCList.
        """
        if startL > endL:
            return None
        # Floor division: the indices are integers (same result as "/" on Python 2).
        middleL = (startL + endL) // 2
        cursor.moveSibling(middleL)
        overlap = self.isOverlapping(cursor)
        if overlap == 0:
            if middleL == startL:
                return cursor
            else:
                # An overlap is found: look for an earlier one on the left.
                return self.binarySearch(cursor, startL, middleL)
        if overlap == -1:
            return self.binarySearch(cursor, middleL + 1, endL)
        return self.binarySearch(cursor, startL, middleL - 1)

    def compare(self, cursor = None):
        """Record every element overlapping the query, descending into children.

        Called without argument, the search starts from the top of the
        NC-list; the method then calls itself on the children of each
        overlapping element.
        """
        self._ncList.openFiles()
        if cursor is None:
            cursor = NCListCursor(None, self._ncList, 0, self._verbosity)
        cursor._getSiblingData()
        cursor = self.binarySearch(cursor, cursor._firstSiblingLIndex, cursor._lastSiblingLIndex)
        if cursor is None:
            return
        while not cursor.isOut() and self.isOverlapping(cursor) == 0:
            self.write(cursor)
            newCursor = NCListCursor(cursor)
            if newCursor.hasChildren():
                newCursor.moveDown()
                self.compare(newCursor)
            if cursor.isLast():
                return
            cursor.moveRight()

    def isOverlapping(self, cursor):
        """Compare the query with the element under `cursor`.

        Returns 1 if the query ends before the element starts, -1 if the
        query starts after the element ends, and 0 if they overlap.
        """
        if self._end < cursor.getStart():
            return 1
        if self._start > cursor.getEnd():
            return -1
        return 0

    def write(self, cursor):
        """Record the name of the element under `cursor` as an overlap."""
        self._nbOverlaps += 1
        refTranscript = cursor.getTranscript()
        self._overlappingNames.append(refTranscript.getName())

    def dumpWriter(self):
        """Write the query with its overlap tags, then reset the list of overlaps.

        Nothing is written when no overlap was recorded.
        """
        if (not self._overlappingNames) or self._transcript is None:
            return
        self._transcript.setTagValue("nbOverlaps", len(self._overlappingNames))
        self._transcript.setTagValue("overlapsWith", "--".join(self._overlappingNames))
        self._iWriter.addTranscript(self._transcript)
        self._nbWritten += 1
        self._overlappingNames = []

    def run(self):
        """Sort the input, build the NC-list, search it and write the result."""
        self.prepareIntermediateFiles()
        self.createNCList()
        self.compare()
        self.dumpWriter()
        self.close()
        if self._verbosity > 0:
            print("# refs: %d" % (self._nbLines))
            # _nbWritten is the counter maintained by dumpWriter(); the
            # attribute previously printed here was never defined.
            print("# written: %d (%d overlaps)" % (self._nbWritten, self._nbOverlaps))


if __name__ == "__main__":
    description = "FindOverlapsWithOneInterval: Finds overlaps with one query interval."

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="Input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="Format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-s", "--start", dest="start", action="store", type="int", help="The start of the query interval [compulsory] [format: int]")
    parser.add_option("-e", "--end", dest="end", action="store", type="int", help="The end of the query interval [compulsory] [format: int]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", type="string", help="Chromosome of the query interval [compulsory] [format: string]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
    (options, args) = parser.parse_args()

    # Fail early with a usage message instead of an obscure traceback on None.
    for optionName, flag in (("inputFileName", "-i"), ("format", "-f"), ("start", "-s"), ("end", "-e"), ("chromosome", "-c"), ("outputFileName", "-o")):
        if getattr(options, optionName) is None:
            parser.error("option %s is compulsory" % (flag))

    iFOWOI = FindOverlapsWithOneInterval(options.verbosity)
    iFOWOI.setFileName(options.inputFileName, options.format)
    iFOWOI.setInterval(options.chromosome, options.start, options.end)
    iFOWOI.setOutputFileName(options.outputFileName)
    iFOWOI.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,182 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+\n+import os, struct, time\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.ncList.NCList import NCList\n+from SMART.Java.Python.ncList.NCListCursor import NCListCursor\n+from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle\n+from SMART.Java.Python.ncList.FileSorter import FileSorter\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+from SMART.Java.Python.ncList.NCListCursor import NCListCursor\n+from SMART.Java.Python.ncList.FindOverlapsWithOneInterval import FindOverlapsWithOneInterval\n+\n+REFERENCE = 0\n+QUERY = 1\n+TYPETOSTRING = {0: "reference", 1: "query"}\n+\n+class FindOverlapsWithSeveralIntervals(object):\n+ \n+ def __init__(self, verbosity = 1):\n+ self._parsers = {}\n+ self._outputFileName = "outputOverlaps.gff3"\n+ self._iWriter = None\n+ self._nbLines = {REFERENCE: 0, QUERY: 0}\n+ self._verbosity = verbosity\n+ self._ncLists = {}\n+ self._sortedRefFileNames = None\n+ self._transQueryFileName = None\n+ self._cursors = {}\n+ self._iFowoi = FindOverlapsWithOneInterval(self._verbosity)\n+ \n+ def __del__(self):\n+ self.close()\n+ for fileName in (self._sortedRefFileNames, self._transQueryFileName):\n+ if os.path.exists(fileName):\n+ os.remove(fileName)\n+ \n+ def close(self):\n+ self._iFowoi.close()\n+ \n+ def setRefFileName(self, fileName, format):\n+ 
self.setFileName(fileName, format, REFERENCE)\n+ self._sortedRefFileNames = "%s_ref_sorted.pkl" % (os.path.splitext(fileName)[0])\n+ \n+ def setQueryFileName(self, fileName, format):\n+ self.setFileName(fileName, format, QUERY)\n+ self._transQueryFileName = "%s_query_trans.pkl" % (os.path.splitext(fileName)[0])\n+\n+ def setFileName(self, fileName, format, type):\n+ chooser = ParserChooser(self._verbosity)\n+ chooser.findFormat(format)\n+ self._parsers[type] = chooser.getParser(fileName)\n+ \n+ def setOutputFileName(self, outputFileName):\n+ self._iFowoi.setOutputFileName(outputFileName)\n+\n+ def _sortRefFile(self):\n+ fs = FileSorter(self._p'..b'\n+ self._sortRefFile()\n+ self._translateQueryFile()\n+\n+ def createNCLists(self):\n+ self._ncLists = {}\n+ self._indices = {}\n+ self._cursors = {}\n+ for chromosome, fileName in self._splittedFileNames.iteritems():\n+ if self._verbosity > 3:\n+ print " chromosome %s" % (chromosome)\n+ ncList = NCList(self._verbosity)\n+ ncList.createIndex(True)\n+ ncList.setChromosome(chromosome)\n+ ncList.setFileName(fileName)\n+ ncList.setNbElements(self._nbRefLinesPerChromosome[chromosome])\n+ ncList.buildLists()\n+ self._ncLists[chromosome] = ncList\n+ cursor = NCListCursor(None, ncList, 0, self._verbosity)\n+ self._cursors[chromosome] = cursor\n+ self._indices[chromosome] = ncList.getIndex()\n+ endTime = time.time()\n+\n+ def compare(self):\n+ progress = Progress(self._nbLines[QUERY], "Comparing data", self._verbosity-3)\n+ startTime = time.time()\n+ for cpt, queryTranscript in enumerate(self._parsers[QUERY].getIterator()):\n+ chromosome = queryTranscript.getChromosome()\n+ if chromosome not in self._ncLists:\n+ continue\n+ self._iFowoi.setNCList(self._ncLists[chromosome], self._indices[chromosome])\n+ self._iFowoi.setTranscript(queryTranscript)\n+ self._iFowoi.compare()\n+ self._iFowoi.dumpWriter()\n+ progress.inc()\n+ progress.done()\n+ endTime = time.time()\n+ self._timeSpent = endTime - startTime\n+\n+ def run(self):\n+ 
startTime = time.time()\n+ if self._verbosity > 2:\n+ print "Creating NC-list..."\n+ self.prepareIntermediateFiles()\n+ self.createNCLists()\n+ endTime = time.time()\n+ if self._verbosity > 2:\n+ print " ...done (%.2gs)" % (endTime - startTime)\n+ self.compare()\n+ self.close()\n+ if self._verbosity > 0:\n+ print "# queries: %d" % (self._nbLines[QUERY])\n+ print "# refs: %d" % (self._nbLines[REFERENCE])\n+ print "# written: %d (%d overlaps)" % (self._iFowoi._nbWritten, self._iFowoi._nbOverlaps)\n+ print "time: %.2gs" % (self._timeSpent)\n+\n+\n+if __name__ == "__main__":\n+ description = "FindOverlaps With Several Intervals v1.0.0: Finds overlaps with several query intervals. [Category: Data comparison]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--query", dest="inputQueryFileName", action="store", type="string", help="Query input file [compulsory] [format: file in transcript format given by -f]")\n+ parser.add_option("-f", "--queryFormat", dest="queryFormat", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+ parser.add_option("-j", "--ref", dest="inputRefFileName", action="store", type="string", help="Reference input file [compulsory] [format: file in transcript format given by -g]")\n+ parser.add_option("-g", "--refFormat", dest="refFormat", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="Output file [compulsory] [format: output file in GFF3 format]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")\n+ (options, args) = parser.parse_args()\n+ \n+ iFWSI = FindOverlapsWithSeveralIntervals(options.verbosity)\n+ iFWSI.setRefFileName(options.inputRefFileName, options.refFormat)\n+ 
iFWSI.setQueryFileName(options.inputQueryFileName, options.queryFormat)\n+ iFWSI.setOutputFileName(options.outputFileName)\n+ iFWSI.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsBin.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsBin.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,204 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2011\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import random, os, os.path, time, sqlite3\n+from optparse import OptionParser\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from commons.core.writer.TranscriptWriter import TranscriptWriter\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.Mapping import Mapping\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress\n+try:\n+ import cPickle as pickle\n+except:\n+ import pickle\n+\n+MINBIN = 3\n+MAXBIN = 7\n+\n+\n+def getBin(start, end):\n+\tfor i in range(MINBIN, MAXBIN + 1):\n+\t\tbinLevel = 10 ** i\n+\t\tif int(start / binLevel) == int(end / binLevel):\n+\t\t\treturn int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))\n+\treturn int((MAXBIN + 1) * 10 ** (MAXBIN + 1))\n+\n+def getOverlappingBins(start, end):\n+\tarray\t= []\n+\tbigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))\n+\tfor i in range(MINBIN, MAXBIN + 1):\n+\t\tbinLevel = 10 ** i\n+\t\tarray.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))\n+\tarray.append((bigBin, bigBin))\n+\treturn array\n+\n+\n+class FindOverlapsWithSeveralIntervalsBin(object):\n+\n+\tdef __init__(self, verbosity):\n+\t\tself.verbosity\t= verbosity\n+\t\tself.randomNumber = random.randint(0, 10000)\n+\t\tself.dbName\t = "smartdb%d" % (self.randomNumber)\n+\t\tif "SMARTTMPPATH" in os.environ:\n+\t\t\tself.dbName = 
os.join(os.environ["SMARTTMPPATH"], self.dbName)\n+\t\tself.connection = sqlite3.connect(self.dbName)\n+\t\tself.tableNames = {}\n+\t\tself.nbQueries = 0\n+\t\tself.nbRefs\t = 0\n+\t\tself.nbWritten = 0\n+\t\tself.nbOverlaps = 0\n+\t\tcursor = self.connection.cursor()\n+\t\tcursor.execute("PRAGMA journal_mode = OFF")\n+\t\tcursor.execute("PRAGMA synchronous = 0")\n+\t\tcursor.execute("PRAGMA locking_mode = EXCLUSIVE")\n+\t\tcursor.execute("PRAGMA count_change = OFF")\n+\t\tcursor.execute("PRAGMA temp_store = 2")\n+\n+\tdef __del__(self):\n+\t\tcursor = self.connection.cursor()\n+\t\tfor tableName in self.tableNames.values():\n+\t\t\tcursor.execute("DROP TABLE IF EXISTS %s" % (tableName))\n+\t\tif os.path.exists(self.dbName):\n+\t\t\tos.remove(self.dbName)\n+\t\t\n+\tdef createTable(self, chromosome):\n+\t\tcursor = self.connection.cursor()\n+\t\ttableName = "tmpTable_%s_%d" % (chromosome.replace("-", "_"), self.randomNumber)\n+\t\tcursor.execute("CREATE TABLE %s (start INT, end INT, transcript BLOB, bin INT)" % (tableName))\n+\t\tcursor.execute("CRE'..b'ursor\t = self.connection.cursor()\n+\t\t\tcursor.execute("INSERT INTO %s (start, end, transcript, bin) VALUES (?, ?, ?, ?)" % (self.tableNames[chromosome]), (start, end, sqlite3.Binary(transcriptString), bin))\n+\t\t\tself.nbRefs += 1\n+\t\tself.connection.commit()\n+\t\tendTime = time.time()\n+\t\tif self.verbosity > 2:\n+\t\t\tprint "\t...done (%.2gs)" % (endTime - startTime)\n+\n+\tdef setQueryFile(self, fileName, format):\n+\t\tchooser = ParserChooser(self.verbosity)\n+\t\tchooser.findFormat(format)\n+\t\tself.queryParser = chooser.getParser(fileName)\n+\t\tself.nbQueries = self.queryParser.getNbItems()\n+\n+\tdef setOutputFile(self, fileName):\n+\t\tself.writer = TranscriptWriter(fileName, "gff3", self.verbosity)\n+\n+\tdef compare(self):\n+\t\tprogress = Progress(self.nbQueries, "Reading queries", self.verbosity)\n+\t\tstartTime = time.time()\n+\t\tfor queryTranscript in 
self.queryParser.getIterator():\n+\t\t\tif queryTranscript.__class__.__name__ == "Mapping":\n+\t\t\t\tqueryTranscript = queryTranscript.getTranscript()\n+\t\t\tprogress.inc()\n+\t\t\tqueryChromosome = queryTranscript.getChromosome()\n+\t\t\tif queryChromosome not in self.tableNames:\n+\t\t\t\tcontinue\n+\t\t\tqueryStart = queryTranscript.getStart()\n+\t\t\tqueryEnd = queryTranscript.getEnd()\n+\t\t\tbins\t = getOverlappingBins(queryStart, queryEnd)\n+\t\t\tcommands = []\n+\t\t\tfor bin in bins:\n+\t\t\t\tcommand = "SELECT * FROM %s WHERE bin " % (self.tableNames[queryChromosome])\n+\t\t\t\tif bin[0] == bin[1]:\n+\t\t\t\t\tcommand += "= %d" % (bin[0])\n+\t\t\t\telse:\n+\t\t\t\t\tcommand += "BETWEEN %d AND %d" % (bin[0], bin[1])\n+\t\t\t\tcommands.append(command)\n+\t\t\tcommand = " UNION ".join(commands)\n+\t\t\tcursor = self.connection.cursor()\n+\t\t\tcursor.execute(command)\n+\t\t\toverlap = False\n+\t\t\tline\t= cursor.fetchone()\n+\t\t\twhile line:\n+\t\t\t\trefStart, refEnd, refTranscriptString, refBin = line\n+\t\t\t\tif refStart <= queryEnd and refEnd >= queryStart:\n+\t\t\t\t\trefTranscript = pickle.loads(str(refTranscriptString))\n+\t\t\t\t\tif refTranscript.overlapWith(queryTranscript):\n+\t\t\t\t\t\toverlap = True\n+\t\t\t\t\t\tself.nbOverlaps += 1\n+\t\t\t\tline = cursor.fetchone()\n+\t\t\tif overlap:\n+\t\t\t\tself.writer.addTranscript(queryTranscript)\n+\t\t\t\tself.nbWritten += 1\n+\t\tprogress.done()\n+\t\tendTime = time.time()\n+\t\tself.timeSpent = endTime - startTime\n+\n+\tdef displayResults(self):\n+\t\tprint "# queries: %d" % (self.nbQueries)\n+\t\tprint "# refs:\t %d" % (self.nbRefs)\n+\t\tprint "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)\n+\t\tprint "time:\t %.2gs" % (self.timeSpent)\n+\n+\tdef run(self):\n+\t\tself.compare()\n+\t\tself.displayResults()\n+\n+if __name__ == "__main__":\n+\t\n+\tdescription = "Find Overlaps With Several Intervals Using Bin v1.0.1: Use MySQL binning to compare intervals. 
[Category: Personal]"\n+\n+\tparser = OptionParser(description = description)\n+\tparser.add_option("-i", "--input1",\t dest="inputFileName1", action="store",\t\t\ttype="string", help="query input file [compulsory] [format: file in transcript format given by -f]")\n+\tparser.add_option("-f", "--format1",\t dest="format1",\t\taction="store",\t\t\ttype="string", help="format of previous file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-j", "--input2",\t dest="inputFileName2", action="store",\t\t\ttype="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")\n+\tparser.add_option("-g", "--format2",\t dest="format2",\t\taction="store",\t\t\ttype="string", help="format of previous file [compulsory] [format: transcript file format]")\n+\tparser.add_option("-o", "--output",\t dest="outputFileName", action="store",\t\t\ttype="string", help="output file [format: output file in GFF3 format]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity",\t action="store", default=1, type="int",\thelp="trace level [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\tfowsib = FindOverlapsWithSeveralIntervalsBin(options.verbosity)\n+\tfowsib.setQueryFile(options.inputFileName1, options.format1)\n+\tfowsib.setReferenceFile(options.inputFileName2, options.format2)\n+\tfowsib.setOutputFile(options.outputFileName)\n+\tfowsib.run()\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsIndex.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervalsIndex.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,137 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import random, os, time, MySQLdb
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


class FindOverlapsWithSeveralIntervalsIndex(object):
    """Write out the query transcripts that overlap at least one reference.

    The (start, end) pair of every reference transcript is stored in an
    indexed MySQL table; each query transcript is then tested with a single
    SELECT on that table and written to a GFF3 file when a row matches.

    NOTE(review): the table holds only `start` and `end`, so the comparison
    does not take the chromosome or the strand into account -- confirm that
    this is what this benchmark tool is meant to do.
    """

    def __init__(self, verbosity):
        """Open the MySQL connection and pick a table name.

        verbosity -- trace level, forwarded to parsers, writer and progress bars
        """
        self.verbosity  = verbosity
        # Set before connecting so that __del__ stays safe if connect() raises.
        self.db         = None
        self.tableName  = "table_%s" % (random.randint(0, 10000))
        self.nbQueries  = 0
        self.nbRefs     = 0
        self.nbOverlaps = 0
        # Defined up front so displayResults() works even before compare().
        self.timeSpent  = 0
        self.dbName     = "smartdb"
        if "SMARTTMPPATH" in os.environ:
            # The original called os.join(), which does not exist and raised
            # AttributeError as soon as SMARTTMPPATH was set.
            self.dbName = os.path.join(os.environ["SMARTTMPPATH"], self.dbName)
        self.db = MySQLdb.connect(db = self.dbName)

    def __del__(self):
        """Drop the temporary table, if a connection was ever opened."""
        db = getattr(self, "db", None)
        if db is None:
            return
        cursor = db.cursor()
        # Table names cannot be bound as parameters; the name is generated
        # locally in __init__, never taken from user input.
        cursor.execute("DROP TABLE IF EXISTS %s" % (self.tableName))

    def setReferenceFile(self, fileName, format):
        """Create the table and load every reference interval into it.

        fileName -- path of the reference file
        format   -- name of the transcript format, as understood by ParserChooser
        """
        cursor = self.db.cursor()
        cursor.execute("CREATE TABLE %s (start INT, end INT)" % (self.tableName))
        cursor.execute("CREATE INDEX index_%s ON %s (start, end)" % (self.tableName, self.tableName))
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        parser   = chooser.getParser(fileName)
        progress = UnlimitedProgress(1000, "Reading references", self.verbosity)
        # The coordinates are bound by the driver ("%%s" survives the table
        # name substitution as the DB-API "%s" placeholder).
        command  = "INSERT INTO %s (start, end) VALUES (%%s, %%s)" % (self.tableName)
        for transcript in parser.getIterator():
            cursor.execute(command, (transcript.getStart(), transcript.getEnd()))
            self.nbRefs += 1
            progress.inc()
        self.db.commit()
        progress.done()

    def setQueryFile(self, fileName, format):
        """Open the query file and record how many transcripts it holds."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.queryParser = chooser.getParser(fileName)
        self.nbQueries   = self.queryParser.getNbTranscripts()

    def setOutputFile(self, fileName):
        """Create the GFF3 writer that receives the overlapping queries."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def compare(self):
        """Test every query against the reference table.

        A query is written (and counted in nbOverlaps) when at least one
        reference row satisfies start <= queryEnd and end >= queryStart.
        The elapsed wall-clock time is stored in self.timeSpent.
        """
        progress  = Progress(self.nbQueries, "Reading queries", self.verbosity)
        startTime = time.time()
        # Only the existence of a match matters, hence LIMIT 1 instead of
        # fetching every matching row just to set a flag.
        command   = "SELECT 1 FROM %s WHERE start <= %%s AND end >= %%s LIMIT 1" % (self.tableName)
        cursor    = self.db.cursor()
        for queryTranscript in self.queryParser.getIterator():
            cursor.execute(command, (queryTranscript.getEnd(), queryTranscript.getStart()))
            if cursor.fetchone():
                self.writer.addTranscript(queryTranscript)
                self.nbOverlaps += 1
            progress.inc()
        progress.done()
        self.timeSpent = time.time() - startTime

    def displayResults(self):
        """Print the counters and the time spent in compare()."""
        # Single-argument print(...) behaves the same as the print statement
        # under Python 2 and is also valid Python 3.
        print("# queries: %d" % (self.nbQueries))
        print("# refs: %d" % (self.nbRefs))
        print("# overlaps: %d" % (self.nbOverlaps))
        print("time: %.2gs" % (self.timeSpent))

    def run(self):
        """Compare the queries with the references, then print the summary."""
        self.compare()
        self.displayResults()

if __name__ == "__main__":

    description = "Find Overlaps With Several Intervals Using Indices v1.0.1: Use MySQL to compare intervals. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Fail with a usage message instead of a traceback deep inside a parser
    # when one of the options documented as compulsory is missing.
    for flag, value in (("-i", options.inputFileName1), ("-f", options.format1), ("-j", options.inputFileName2), ("-g", options.format2)):
        if value is None:
            parser.error("option %s is compulsory" % (flag))

    fowsii = FindOverlapsWithSeveralIntervalsIndex(options.verbosity)
    fowsii.setQueryFile(options.inputFileName1, options.format1)
    fowsii.setReferenceFile(options.inputFileName2, options.format2)
    fowsii.setOutputFile(options.outputFileName)
    fowsii.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/FindOverlaps_naif.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/FindOverlaps_naif.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,85 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import os
import struct
from optparse import OptionParser
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.Gff3Writer import Gff3Writer

LONGSIZE = struct.calcsize('l')

class FindOverlaps_naif(object):
    """Naive reference implementation of the overlap search.

    Every query transcript is compared with every reference transcript; the
    reference file is parsed again from the start for each query.  Queries
    that overlap at least one reference are written to a GFF3 file, tagged
    with the number and the IDs of the references they overlap.
    """

    def __init__(self, inputRefGff3FileName, inputQueryGff3FileName):
        """Store the input file names; the output is set separately.

        inputRefGff3FileName   -- path of the reference GFF3 file
        inputQueryGff3FileName -- path of the query GFF3 file
        """
        self._inputRefGff3FileName   = inputRefGff3FileName
        self._inputQueryGff3FileName = inputQueryGff3FileName
        # Initialised here so that close() and setOutputGff3FileName('') do
        # not hit an undefined attribute on a fresh object.
        self._outputGff3FileName     = None
        self._iGff3Writer            = None

    def close(self):
        """Close the output writer, if one was created."""
        if self._iGff3Writer is not None:
            self._iGff3Writer.close()

    def setGff3FileName(self, fileName):
        """Set the reference GFF3 file name."""
        self._inputRefGff3FileName = fileName

    def setQueryGff3FileName(self, fileName):
        """Set the query GFF3 file name."""
        self._inputQueryGff3FileName = fileName

    def setOutputGff3FileName(self, outputGff3FileName):
        """Set the output file name and (re)create the GFF3 writer.

        An empty name keeps the previously stored one, as before.

        Raises ValueError when no usable name is available; previously this
        surfaced as an AttributeError ('' on a fresh object) or passed None
        on to Gff3Writer.
        """
        if outputGff3FileName:
            self._outputGff3FileName = outputGff3FileName
        if not self._outputGff3FileName:
            raise ValueError("No output GFF3 file name given")
        # Release the writer being replaced instead of leaking it.
        if self._iGff3Writer is not None:
            self._iGff3Writer.close()
        self._iGff3Writer = Gff3Writer(self._outputGff3FileName)

    def run(self):
        """Write every query transcript that overlaps a reference transcript.

        Each written query receives the tags "nbOverlaps" (number of
        overlapping references) and "overlapsWith" (their 'ID' tag values
        joined with "--").

        Raises ValueError when the output file has not been set.
        """
        if self._iGff3Writer is None:
            raise ValueError("Output file not set: call setOutputGff3FileName() first")
        queryParser = GffParser(self._inputQueryGff3FileName, 0)
        for queryTranscript in queryParser.getIterator():
            # Deliberately naive: a fresh parser re-reads the whole reference
            # file for each query.
            refParser = GffParser(self._inputRefGff3FileName, 0)
            ids = [refTranscript.getTagValue('ID') for refTranscript in refParser.getIterator() if queryTranscript.overlapWith(refTranscript)]
            if ids:
                queryTranscript.setTagValue("nbOverlaps", len(ids))
                queryTranscript.setTagValue("overlapsWith", "--".join(ids))
                self._iGff3Writer.addTranscript(queryTranscript)

if __name__ == "__main__":
    description = "FindOverlapsWithSeveralInterval: Finds overlaps with several query intervals."

    parser = OptionParser(description = description)
    parser.add_option("-i", "--inputRef", dest="inputRefGff3FileName", action="store", type="string", help="Reference input file [compulsory] [format: file in gff3 format]")
    parser.add_option("-j", "--inputQuery", dest="inputQueryGff3FileName", action="store", type="string", help="Query input file [compulsory] [format: file in gff3 format]")
    parser.add_option("-o", "--output", dest="outputGff3FileName", action="store", type="string", help="output file [compulsory] [format: output file in gff3 format]")
    (options, args) = parser.parse_args()

    # All three options are documented as compulsory: report a usage error
    # instead of failing later with a traceback.
    for flag, value in (("-i", options.inputRefGff3FileName), ("-j", options.inputQueryGff3FileName), ("-o", options.outputGff3FileName)):
        if not value:
            parser.error("option %s is compulsory" % (flag))

    iFON = FindOverlaps_naif(options.inputRefGff3FileName, options.inputQueryGff3FileName)
    iFON.setOutputGff3FileName(options.outputGff3FileName)
    try:
        iFON.run()
    finally:
        # Close the writer even when the comparison fails half-way.
        iFON.close()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/NCIndex.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/NCIndex.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,55 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

from SMART.Java.Python.structure.Transcript import Transcript

class NCIndex(object):
    """Fixed-step lookup table from a genomic position to a stored index.

    The table has one slot per window of `_step` positions.  Slot number b
    holds the index given with the first transcript whose end reached
    window b.
    """

    def __init__(self, verbosity):
        """Create an empty table with the default window size of 10000."""
        self._verbosity = verbosity
        self._step      = 10000
        self._indices   = []

    def setStep(self, step):
        """Change the window size used by later calls."""
        self._step = step

    def addTranscript(self, end, index):
        """Give `index` to every slot not filled yet, up to the window of `end`.

        Nothing is added when the window containing `end` is already filled.
        """
        lastSlot  = int(end / self._step)
        nbMissing = lastSlot + 1 - len(self._indices)
        # A non-positive count produces an empty list, so nothing is appended.
        self._indices.extend([index] * nbMissing)

    def getIndex(self, transcript):
        """Return the index stored for the window holding the transcript start.

        A start lying beyond the last filled window gets the last stored
        index.
        """
        slot = int(transcript.getStart() / self._step)
        # Clamp to the last slot; on an empty table this still raises
        # IndexError, exactly as before.
        return self._indices[min(slot, len(self._indices) - 1)]
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/NCList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/NCList.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,337 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import os, os.path\n+import struct\n+import shelve\n+import sys\n+from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle\n+from SMART.Java.Python.ncList.NCIndex import NCIndex\n+from SMART.Java.Python.misc.Progress import Progress\n+\n+LONG_SIZE = struct.calcsize(\'l\')\n+\n+H = 0\n+L = 1\n+T = 2\n+G = 3\n+\n+H_CELL_SIZE = 2\n+L_CELL_SIZE = 5\n+T_CELL_SIZE = 6\n+\n+START = 0\n+END\t = 1\n+ADDRESS = 2\n+LIST\t= 3\n+PARENT = 4\n+NEW\t = 5\n+LENGTH = 1\n+\n+def pack(input):\n+\treturn struct.pack("l", long(input))\n+def unpack(input):\n+\treturn struct.unpack("l", input)[0]\n+\n+\n+class NCList(object):\n+\n+\tdef __init__(self, verbosity):\n+\t\tself._verbosity\t\t = verbosity\n+\t\tself._subPos\t\t\t = 0\n+\t\tself._parentPos\t\t = 0\n+\t\tself._nbLines\t\t\t = 0\n+\t\tself._nbLists\t\t\t = 0\n+\t\tself._chromosome\t\t = None\n+\t\tself._transcriptFileName = None\n+\t\tself._lHandle\t\t\t = None\n+\t\tself._hHandle\t\t\t = None\n+\t\tself._tHandle\t\t\t = None\n+\t\tself._parser\t\t\t = None\n+\t\tself._sizeDict\t\t = {H: H_CELL_SIZE, L: L_CELL_SIZE, T: T_CELL_SIZE}\n+\t\tself._offsets\t\t\t = {H: 0, L: 0, G: 0}\n+\t\tself._fileNameDict\t = {}\n+\t\tself._handleDict\t\t = {}\n+\t\tself._createIndex\t\t = False\n+\t\tself._missingValues\t = dict([table, {}] for table in self._sizeDict)\n+\t\tself._missingValues[T][LIST] = -1\n+\t\tself._missingValues[L][LIST] = 0\n+\t\tself._missingValues[T][NEW] = -1\n+\n+\tdef __del__(self):\n+\t\tfor handle in (self._lHandle, self._hHandle):\n+\t\t\tif handle != 
None:\n+\t\t\t\thandle.close()\n+\n+\tdef createIndex(self, boolean):\n+\t\tself._createIndex = boolean\n+\n+\tdef setChromosome(self, chromosome):\n+\t\tself._chromosome = chromosome\n+\n+\tdef setFileName(self, fileName):\n+\t\tself._transcriptFileName = fileName\n+\t\tself._parser = NCListFileUnpickle(fileName, self._verbosity)\n+\t\tself._setFileNames(fileName)\n+\n+\tdef setNbElements(self, nbElements):\n+\t\tself._nbLines = nbElements\n+\n+\tdef setOffset(self, fileType, offset):\n+\t\tself._offsets[fileType] = offset\n+\n+\tdef _setFileNames(self, fileName):\n+\t\tif self._chromosome != None and fileName != None:\n+\t\t\tcoreName = os.path.splitext(fileName)[0]\n+\t\t\tif "SMARTTMPPATH" in os.environ:\n+\t\t\t\tcoreName = os.path.join(os.environ["SMARTTMPPATH"], coreName)\n+\t\t\tself._hFileName = "%s_H.bin" % (coreName)\n+\t\t\tself._lFileName = "%s_L.bin" % (coreName)\n+\t\t\tself._tFileName = "%s_T.bin" % (coreName)\n+\t\t\tself._fileNameDict = {H: self._hFileName, L: self._lFileName, T: self._tFileName'..b's, "Filling table T", self._verbosity-5)\n+\t\tfor i, transcript in enumerate(self._parser.getIterator()):\n+\t\t\tself._writeValue(T, i, START, transcript.getStart())\n+\t\t\tself._writeValue(T, i, END,\t transcript.getEnd())\n+\t\t\tself._writeValue(T, i, ADDRESS, self._parser.getCurrentTranscriptAddress())\n+\t\t\tself._writeValue(T, i, PARENT, -1)\n+\t\t\tself._writeValue(T, i, LIST,\t-1)\n+\t\t\tprogress.inc()\n+\t\tprogress.done()\n+\t\tprogress = Progress(self._nbLists, "Filling table H", self._verbosity-5)\n+\t\tfor i in xrange(self._nbLists):\n+\t\t\tself._writeValue(H, i, LENGTH, 0)\n+\t\t\tprogress.inc()\n+\t\tprogress.done()\n+\n+\tdef _labelLists(self):\n+\t\tprogress = Progress(self._nbLines, "Getting table structure", self._verbosity-5)\n+\t\tnextL = 0\n+\t\tfor i in xrange(self._nbLines):\n+\t\t\tp\t = i - 1\n+\t\t\tstart = self._readValue(T, i, START)\n+\t\t\tend = self._readValue(T, i, END)\n+\t\t\twhile p != -1 and (start < 
self._readValue(T, p, START) or end > self._readValue(T, p, END)):\n+\t\t\t\tp = self._readValue(T, p, PARENT)\n+\t\t\tthisL = self._readValue(T, p, LIST)\n+\t\t\tif thisL == -1:\n+\t\t\t\t#print "entering"\n+\t\t\t\tthisL = nextL\n+\t\t\t\tnextL += 1\n+\t\t\t\tlength = 0\n+\t\t\t\tself._writeValue(T, p, LIST, thisL)\n+\t\t\telse:\n+\t\t\t\tlength = self._readValue(H, thisL, LENGTH)\n+\t\t\tself._writeValue(T, i,\t PARENT, p)\n+\t\t\tself._writeValue(H, thisL, LENGTH, length + 1)\n+\t\t\tprogress.inc()\n+\t\tprogress.done()\n+\n+\tdef _computeSubStart(self):\n+\t\tprogress = Progress(self._nbLines, "Getting table sub-lists", self._verbosity-5)\n+\t\ttotal = 0\n+\t\tfor i in xrange(self._nbLists):\n+\t\t\tself._writeValue(H, i, START, total)\n+\t\t\ttotal += self._readValue(H, i, LENGTH)\n+\t\t\tself._writeValue(H, i, LENGTH, 0)\n+\t\t\tprogress.inc()\n+\t\tprogress.done()\n+\n+\tdef _computeAbsPosition(self):\n+\t\tprogress = Progress(self._nbLines, "Writing table", self._verbosity-5)\n+\t\tself._sizeFirstList = 0\n+\t\tfor i in xrange(self._nbLines):\n+\t\t\ts = self._readValue(T, i, START)\n+\t\t\te = self._readValue(T, i, END)\n+\t\t\ta = self._readValue(T, i, ADDRESS)\n+\t\t\tpt = self._readValue(T, i, PARENT)\n+\t\t\th = self._readValue(T, pt, LIST)\n+\t\t\tpl = self._readValue(T, pt, NEW)\n+\t\t\tnb = self._readValue(H, h, LENGTH)\n+\t\t\tl = self._readValue(H, h, START) + nb\n+\t\t\tself._writeValue(T, i, NEW,\t l)\n+\t\t\tself._writeValue(L, l, START, s)\n+\t\t\tself._writeValue(L, l, END,\t e)\n+\t\t\tself._writeValue(L, l, ADDRESS, a)\n+\t\t\tself._writeValue(L, l, LIST,\t-1)\n+\t\t\tself._writeValue(L, l, PARENT, pl)\n+\t\t\tself._writeValue(H, h, LENGTH, nb+1)\n+\t\t\tif nb == 0:\n+\t\t\t\t#print "adding it"\n+\t\t\t\tself._writeValue(L, pl, LIST, h)\n+\t\t\tif pl == -1:\n+\t\t\t\tself._sizeFirstList += 1\n+\t\t\t\tif self._createIndex:\n+\t\t\t\t\tself._index.addTranscript(e, l)\n+\t\t\tprogress.inc()\n+\t\tprogress.done()\n+\n+\tdef 
closeFiles(self):\n+\t\tfor handle in self._handleDict.values():\n+\t\t\thandle.close()\n+\t\tdel self._handleDict\n+\t\tself._lHandle = None\n+\t\tself._hHandle = None\n+\t\tself._tHandle = None\n+\t\tself._parser = None\n+\n+\tdef openFiles(self):\n+\t\tself._lHandle = open(self._fileNameDict[L], "rb")\n+\t\tself._hHandle = open(self._fileNameDict[H], "rb")\n+\t\tself._handleDict = {H: self._hHandle, L: self._lHandle}\n+\t\tself._parser = NCListFileUnpickle(self._transcriptFileName, self._verbosity)\n+\n+\tdef _cleanFiles(self):\n+\t\tself.closeFiles()\n+\t\tos.remove(self._fileNameDict[T])\n+\n+\tdef _getPosition(self, table, line, key):\n+\t\thandle = self._handleDict[table]\n+\t\thandle.seek(self._sizeDict[table] * line * LONG_SIZE + key * LONG_SIZE)\n+\t\treturn handle\n+\n+\tdef _writeValue(self, table, line, key, value):\n+\t\t#print "writing", table, line, key, "<-", value\n+\t\tif line == -1:\n+\t\t\tself._missingValues[table][key] = value\n+\t\t\treturn\n+\t\thandle = self._getPosition(table, line, key)\n+\t\thandle.write(pack(value))\n+\n+\tdef _readValue(self, table, line, key):\n+\t\t#print "reading", table, line, key, "->",\n+\t\tif line == -1:\n+\t\t\t#print self._missingValues[table][key]\n+\t\t\treturn self._missingValues[table][key]\n+\t\thandle = self._getPosition(table, line, key)\n+\t\tr = unpack(handle.read(LONG_SIZE))\n+\t\t#print r\n+\t\treturn r\n+\n+\tdef getIndex(self):\n+\t\treturn self._index\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/NCListCursor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/NCListCursor.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,325 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import os, os.path, struct\n+from commons.core.parsing.GffParser import GffParser\n+from SMART.Java.Python.misc.Progress import Progress\n+\n+\n+class Data(object):\n+ def __init__(self, hIndex, transcript, firstChildLIndex, lastChildLIndex, start, end):\n+ self.hIndex = hIndex\n+ self.transcript = transcript\n+ self.firstChildLIndex = firstChildLIndex\n+ self.lastChildLIndex = lastChildLIndex\n+ self.start = start\n+ self.end = end\n+\n+class NCListCursor(object):\n+\n+ def __init__(self, cursor = None, ncList = None, lIndex = 0, verbosity = 0):\n+ self._verbosity = verbosity\n+ self._mainListData = []\n+ if cursor:\n+ self.copy(cursor)\n+ else:\n+ self._ncList = ncList\n+ self.setLIndex(lIndex)\n+\n+ def setLIndex(self, lIndex):\n+ self._lIndex = lIndex\n+ self._start = None\n+ self._end = None\n+ self._hIndex = None\n+ self._gffIndex = None\n+ self._parentGffIndex = None\n+ self._parentLIndex = None\n+ self._parentHIndex = None\n+ self._parentStart = None\n+ self._parentEnd = None\n+ self._transcript = None\n+ self._firstSiblingLIndex = None\n+ self._lastSiblingLIndex = None\n+ self._firstChildLIndex = None\n+ self._lastChildLIndex = None\n+ self._mainListIndex = lIndex if lIndex < self._ncList.getSizeFirstList() else None\n+\n+ def precompute(self):\n+ self._mainListIndex = 0\n+ progress = Progress(self._ncList.getSizeFirstList(), "Precomputing data", self._verbosity)\n+ for i in range(self._ncList.getSizeFirstList()):\n+ gffIndex, hIndex, parentLIndex, start, end = self._ncList.getLLineElements(i)\n+ transcript = 
self._ncList.getIntervalFromAdress(gffIndex)\n+ firstChildLIndex, nbChildren = self._ncList.getHLineElements(hIndex)\n+ lastChildLIndex = -1 if firstChildLIndex == -1 else firstChildLIndex + nbChildren-1\n+ self._mainListData.append(Data(hIndex, transcript, firstChildLIndex, lastChildLIndex, start, end))\n+ progress.inc()\n+ progress.done()\n+\n+ def _updateFromMainListData(self):\n+ if not self._mainListData or self._lIndex >= self._ncList.getSizeFirstList():\n+ #p'..b'+ self._gffIndex = None\n+ self._transcript = None\n+ self._firstChildLIndex = None\n+ self._lastChildLIndex = None\n+\n+ def moveSibling(self, lIndex):\n+ if self._lIndex < self._ncList.getSizeFirstList() - 1:\n+ self._mainListIndex = lIndex\n+ self._updateFromMainListData()\n+ self._lIndex = lIndex\n+ self._hIndex = None\n+ self._start = None\n+ self._end = None\n+ self._gffIndex = None\n+ self._transcript = None\n+ self._firstChildLIndex = None\n+ self._lastChildLIndex = None\n+\n+ def moveLastSibling(self):\n+ if self._lIndex < self._ncList.getSizeFirstList() - 1:\n+ self._mainListIndex = self._ncList.getSizeFirstList() - 1\n+ self._updateFromMainListData()\n+ if self._lastSiblingLIndex == None:\n+ self._getSiblingData()\n+ self._lIndex = self._lastSiblingLIndex\n+ self._hIndex = None\n+ self._start = None\n+ self._end = None\n+ self._gffIndex = None\n+ self._transcript = None\n+ self._firstChildLIndex = None\n+ self._lastChildLIndex = None\n+\n+ def moveDown(self):\n+ if self._firstChildLIndex == None:\n+ self._getChildrenData()\n+ self._parentLIndex = self._lIndex\n+ self._parentHIndex = self._hIndex\n+ self._parentGffIndex = self._gffIndex\n+ self._lIndex = self._firstChildLIndex\n+ self._lastSiblingLIndex = self._lastChildLIndex\n+ self._hIndex = None\n+ self._gffIndex = None\n+ self._transcript = None\n+ self._firstChildLIndex = None\n+ self._lastChildLIndex = None\n+ self._parentStart = self._start\n+ self._parentEnd = self._end\n+ self._start = None\n+ self._end = None\n+\n+ def 
isOut(self):\n+ return (self._lIndex == -1)\n+\n+ def isTop(self):\n+ if self._parentLIndex == None:\n+ self._getCurrentData()\n+ return (self._parentLIndex == -1)\n+\n+ def hasChildren(self):\n+ if self._hIndex == None:\n+ self._getCurrentData()\n+ if self._hIndex == -1:\n+ return False\n+ if self._firstChildLIndex == None:\n+ self._getChildrenData()\n+ return (self._firstChildLIndex != -1)\n+\n+ def copy(self, cursor):\n+ self._ncList = cursor._ncList\n+ self._lIndex = cursor._lIndex\n+ self._hIndex = cursor._hIndex\n+ self._gffIndex = cursor._gffIndex\n+ self._parentLIndex = cursor._parentLIndex\n+ self._parentHIndex = cursor._parentHIndex\n+ self._parentGffIndex = cursor._parentGffIndex\n+ self._transcript = cursor._transcript\n+ self._firstSiblingLIndex = cursor._firstSiblingLIndex\n+ self._lastSiblingLIndex = cursor._lastSiblingLIndex\n+ self._firstChildLIndex = cursor._firstChildLIndex\n+ self._lastChildLIndex = cursor._lastChildLIndex\n+ self._mainListData = cursor._mainListData\n+ self._mainListIndex = cursor._mainListIndex\n+ self._verbosity = cursor._verbosity\n+ self._parentStart = cursor._parentStart\n+ self._parentEnd = cursor._parentEnd\n+ self._start = cursor._start\n+ self._end = cursor._end\n+\n+ def __str__(self):\n+ return "NC-list: %s, Lindex: %s, Hindex: %s, GFFindex: %s, start: %s, end: %s, parent Lindex: %s, parent Hindex: %s, parent GFFindex: %s, transcript: %s, last sibling: %s" % (self._ncList, self._lIndex, self._hIndex, self._gffIndex, self._start, self._end, self._parentLIndex, self._parentHIndex, self._parentGffIndex, self._transcript, self._lastSiblingLIndex)\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/NCListFilePickle.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/NCListFilePickle.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,123 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

try:
    import cPickle as pickle
except ImportError:
    import pickle
from SMART.Java.Python.structure.Transcript import Transcript


class NCListFilePickle(object):
    """Writer that stores transcripts as consecutive pickles in one binary file."""

    def __init__(self, fileName, verbosity = 1):
        """Open 'fileName' for binary writing (an existing file is truncated)."""
        self.fileName  = fileName
        self.handle    = open(fileName, "wb")
        self.verbosity = verbosity

    def __del__(self):
        # 'handle' does not exist when open() raised inside __init__, so look
        # it up defensively instead of raising a second error on collection.
        handle = getattr(self, "handle", None)
        if handle is not None:
            handle.close()

    def addTranscript(self, transcript):
        """Append one transcript, pickled with the highest available protocol (-1)."""
        pickle.dump(transcript, self.handle, -1)

    def write(self):
        """Do nothing; every transcript is already written by addTranscript()."""
        pass

    def close(self):
        """Close the underlying file (same work as __del__)."""
        self.__del__()


class NCListFileUnpickle(object):
    """Reader for a file of consecutive pickled transcripts.

    SECURITY: unpickling can execute arbitrary code; only use this class on
    files produced by a trusted writer.
    """

    def __init__(self, fileName, verbosity = 1):
        """Open 'fileName' for binary reading, starting at address 0."""
        self.handle        = open(fileName, "rb")
        self.verbosity     = verbosity
        self.initAddress   = 0
        self.address       = self.initAddress
        self.nbTranscripts = None
        self.fileName      = fileName
        self.over          = False
        self.chromosome    = None

    def __del__(self):
        # See NCListFilePickle.__del__: open() may have failed in __init__.
        handle = getattr(self, "handle", None)
        if handle is not None:
            handle.close()

    def reset(self):
        """Seek to the start of the file and reset the initial address to 0.

        'over' and 'address' are left untouched, as before.
        """
        self.handle.seek(0)
        self.initAddress = 0

    def setChromosome(self, chromosome):
        """Restrict reading to 'chromosome'; reading stops at the first other one."""
        self.chromosome = chromosome

    def getNbTranscripts(self):
        """Return the number of transcripts yielded by getIterator().

        The count is computed once and cached in 'nbTranscripts'.  Computing
        it walks the file from 'initAddress', so the read position, 'address'
        and 'over' are modified as a side effect.
        """
        # The original returned the non-existent 'self._nbTranscripts' and
        # incremented the undefined name 'self_nbTranscripts'.
        if self.nbTranscripts is not None:
            return self.nbTranscripts
        self.nbTranscripts = 0
        for transcript in self.getIterator():
            self.nbTranscripts += 1
        return self.nbTranscripts

    def gotoAddress(self, address):
        """Move the read position to byte offset 'address'."""
        self.handle.seek(address)
        self.address = address

    def getNextTranscript(self):
        """Read the next transcript.

        Returns the transcript, or False (and sets 'over') at end of file or
        when a chromosome filter is set and the transcript read belongs to a
        different chromosome.
        """
        # Remember where this transcript starts (see getCurrentTranscriptAddress).
        self.address = self.handle.tell()
        try:
            transcript = pickle.load(self.handle)
            if self.chromosome is not None and transcript.getChromosome() != self.chromosome:
                self.over = True
                return False
            return transcript
        except EOFError:
            self.over = True
            return False

    def getIterator(self):
        """Yield transcripts from 'initAddress' until getNextTranscript() is falsy."""
        self.gotoAddress(self.initAddress)
        while True:
            transcript = self.getNextTranscript()
            if not transcript:
                self.over = True
                return
            yield transcript

    def setInitAddress(self, address):
        """Set the byte offset at which getIterator() starts."""
        self.initAddress = address

    def getCurrentTranscriptAddress(self):
        """Return the byte offset recorded before the last read (or last goto)."""
        return self.address

    def isOver(self):
        """Return True once reading has hit the end of file or of the chromosome."""
        return self.over
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/NCListHandler.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/NCListHandler.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,125 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import struct
try:
    import cPickle as pickle
except ImportError:
    import pickle
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCIndex import NCIndex
from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle

# Size of one header field.  'l' is the platform's native C long, so files
# are only readable on platforms with the same long size as the writer.
LONG_SIZE = struct.calcsize('l')

# Per-chromosome header: INFO_PER_NCLIST consecutive longs, indexed as below.
INFO_PER_NCLIST = 5
H_FILE          = 0
L_FILE          = 1
G_FILE          = 2
FIRST_LIST_SIZE = 3
INDEX           = 4

# Keys passed to NCList.setOffset().
H = 0
L = 1
T = 2
G = 3

def pack(input):
    """Encode an integer as one native C long."""
    # int() instead of the Python-2-only long(); Python 2 promotes to long
    # automatically, so the packed bytes are identical.
    return struct.pack("l", int(input))

def unpack(input):
    """Decode LONG_SIZE bytes (one native C long) into an integer."""
    return struct.unpack("l", input)[0]


class NCListHandler(object):
    """Reader for a merged NC-list file holding one NC-list per chromosome.

    SECURITY: the file is read with pickle.load, which can execute arbitrary
    code; only open files produced by a trusted writer.
    """

    def __init__(self, verbosity):
        self._verbosity = verbosity
        self._index     = False

    def setFileName(self, fileName):
        """Remember 'fileName' and open it for binary reading."""
        self._fileName = fileName
        self._handle   = open(fileName, "rb")

    def _readHeaderField(self, i, info):
        """Return header field 'info' of the i-th chromosome (moves the handle)."""
        self._handle.seek(self._headerPos + i * INFO_PER_NCLIST * LONG_SIZE + info * LONG_SIZE)
        return unpack(self._handle.read(LONG_SIZE))

    def loadData(self):
        """Read the chromosome list, the element counts and one NCList per chromosome."""
        self._chromosomes             = pickle.load(self._handle)
        self._nbElements              = 0
        self._nbElementsPerChromosome = {}
        self._ncLists                 = {}
        for chromosome in self._chromosomes:
            self._nbElementsPerChromosome[chromosome] = unpack(self._handle.read(LONG_SIZE))
            self._nbElements += self._nbElementsPerChromosome[chromosome]
        # The per-chromosome header table starts right after the counts.
        self._headerPos = self._handle.tell()
        for i, chromosome in enumerate(self._chromosomes):
            ncList = NCList(self._verbosity)
            # H and L data are read through this handler's own handle; the
            # transcripts get a separate handle through their own parser.
            ncList._hHandle = self._handle
            ncList._lHandle = self._handle
            ncList._parser  = NCListFileUnpickle(self._fileName)
            ncList.setOffset(H, self._readHeaderField(i, H_FILE))
            ncList.setOffset(L, self._readHeaderField(i, L_FILE))
            ncList.setOffset(G, self._readHeaderField(i, G_FILE))
            ncList._sizeFirstList = self._readHeaderField(i, FIRST_LIST_SIZE)
            indices = self._readHeaderField(i, INDEX)
            # -1 means that no index was stored for this chromosome.
            if indices != -1:
                self._handle.seek(indices)
                index          = NCIndex(self._verbosity)
                index._indices = pickle.load(self._handle)
                ncList._index  = index
            self._ncLists[chromosome] = ncList

    def getChromosomes(self):
        return self._chromosomes

    def getNbElements(self):
        return self._nbElements

    def getNbElementsPerChromosome(self):
        return self._nbElementsPerChromosome

    def getNCLists(self):
        return self._ncLists

    def getParser(self, chromosome = None):
        """Return a new transcript parser on the merged file.

        With a chromosome, the parser starts at that chromosome's transcript
        section and is restricted to it.  Without one, it starts at the
        transcript section of the first chromosome and is not restricted.
        Raises ValueError if 'chromosome' is not in the file.
        """
        parser = NCListFileUnpickle(self._fileName)
        if chromosome is None:
            # The original called unpack(handle, offset), which is a TypeError:
            # the offset has to be sought to and read, as in the branch below.
            if self._chromosomes:
                parser.setInitAddress(self._readHeaderField(0, G_FILE))
            else:
                # No header slot to read.  NOTE(review): assumes the layout
                # written by NCListMerger, where the header position is then
                # the end of the file, so the parser yields nothing.
                parser.setInitAddress(self._headerPos)
            return parser
        i = self._chromosomes.index(chromosome)
        parser.setInitAddress(self._readHeaderField(i, G_FILE))
        parser.setChromosome(chromosome)
        return parser
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/NCListMerger.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/NCListMerger.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,126 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import struct, os, shutil
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Size of one header field.  'l' is the platform's native C long, so files
# are only readable on platforms with the same long size as the writer.
LONG_SIZE = struct.calcsize('l')

# Per-chromosome header: INFO_PER_NCLIST consecutive longs, indexed as below.
INFO_PER_NCLIST = 5
H_FILE          = 0
L_FILE          = 1
G_FILE          = 2
FIRST_LIST_SIZE = 3
INDEX           = 4

def pack(input):
    """Encode an integer as one native C long."""
    # int() instead of the Python-2-only long(); Python 2 promotes to long
    # automatically, so the packed bytes are identical.
    return struct.pack("l", int(input))

def unpack(input):
    """Decode LONG_SIZE bytes (one native C long) into an integer."""
    return struct.unpack("l", input)[0]


class NCListMerger(object):
    """Merge the per-chromosome NC-list files into a single binary file.

    Layout written by merge():
      1. pickled, sorted list of chromosome names;
      2. one long per chromosome: its number of lines ('_nbLines');
      3. the header table: INFO_PER_NCLIST longs per chromosome (offsets of
         its H, L and transcript sections, size of its first list, offset of
         its pickled index, or -1 when no index is stored);
      4. all H files, then all L files, then (optionally) all pickled
         indices, then all transcript files, each group in chromosome order.
    """

    def __init__(self, verbosity):
        self._verbosity = verbosity
        self._index     = False

    def setFileName(self, fileName):
        """Open the output file for binary writing (an existing file is truncated)."""
        self._handle = open(fileName, "wb")

    def setNCLists(self, ncLists):
        """Set the chromosome -> NC-list mapping; chromosomes are processed sorted."""
        self._ncLists     = ncLists
        self._chromosomes = sorted(self._ncLists.keys())

    def addIndex(self, boolean):
        """Choose whether the indices of the NC-lists are stored in the output."""
        self._index = boolean

    def merge(self):
        """Write the merged file, close it, then delete the input files.

        The output handle is closed even on failure; the input files are
        only removed when everything was written successfully.
        """
        try:
            self._writeHeader()
            self._addNCLists()
        finally:
            self._handle.close()
        self._removeInputFiles()

    def _writeHeader(self):
        """Write the chromosome list, the line counts and a header table of -1."""
        pickle.dump(self._chromosomes, self._handle, -1)
        for chromosome in self._chromosomes:
            self._handle.write(pack(self._ncLists[chromosome]._nbLines))
        self._headerPos = self._handle.tell()
        # Placeholders, overwritten later by _addInHeader().
        self._handle.write(pack(-1) * (INFO_PER_NCLIST * len(self._chromosomes)))

    def _addInHeader(self, i, info, value = None):
        """Store 'value' in field 'info' of the i-th chromosome's header.

        When 'value' is None, the current write position is stored.  The
        write position is restored afterwards.
        """
        currentPos = self._handle.tell()
        if value is None:
            value = currentPos
        self._handle.seek(self._headerPos + i * INFO_PER_NCLIST * LONG_SIZE + info * LONG_SIZE)
        self._handle.write(pack(value))
        self._handle.seek(currentPos)

    def _appendInputFiles(self, info, attributeName):
        """Append, for each chromosome, the file named by 'attributeName'.

        The start offset of each copy is recorded in header field 'info' and
        the file name is remembered for later removal.
        """
        for i, chromosome in enumerate(self._chromosomes):
            fileName = getattr(self._ncLists[chromosome], attributeName)
            self._addInHeader(i, info)
            # Binary mode: the data is copied verbatim into a binary file
            # (the original used text mode).
            with open(fileName, "rb") as inputFile:
                shutil.copyfileobj(inputFile, self._handle)
            self._inputFileNames.append(fileName)

    def _addNCLists(self):
        """Write every data section and fill in the header table."""
        self._inputFileNames = []
        self._appendInputFiles(H_FILE, "_hFileName")
        self._appendInputFiles(L_FILE, "_lFileName")
        for i, chromosome in enumerate(self._chromosomes):
            self._addInHeader(i, FIRST_LIST_SIZE, self._ncLists[chromosome].getSizeFirstList())
        if self._index:
            for i, chromosome in enumerate(self._chromosomes):
                self._addInHeader(i, INDEX)
                pickle.dump(self._ncLists[chromosome].getIndex()._indices, self._handle, -1)
        self._appendInputFiles(G_FILE, "_transcriptFileName")

    def _removeInputFiles(self):
        """Delete every input file that was copied into the merged file."""
        for fileName in self._inputFileNames:
            os.remove(fileName)
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/ncList/NCListParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/ncList/NCListParser.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,74 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

import random, os, time
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
try:
    import cPickle as pickle
except ImportError:
    import pickle

class NCListParser(object):
    """Load a set of NC-lists, and their bookkeeping data, from a pickle file.

    SECURITY: the file is read with pickle.load, which can execute arbitrary
    code; only parse files produced by a trusted writer.
    """

    def __init__(self, fileName, verbosity = 1):
        self._fileName                = fileName
        self._ncLists                 = {}
        self._sortedFileNames         = {}
        self._nbElements              = 0
        self._nbElementsPerChromosome = {}
        self._verbosity               = verbosity

    def parse(self):
        """Read the four pickled objects, in order, then reopen each NC-list's files.

        The objects are: sorted file names, number of elements, number of
        elements per chromosome, and the NC-lists themselves.
        """
        # NOTE(review): opened in text mode exactly as before, because the code
        # writing this file is not visible here.  Binary pickle protocols need
        # "rb"; confirm against the writer before changing the mode.
        handle = open(self._fileName)
        try:
            self._sortedFileNames         = pickle.load(handle)
            self._nbElements              = pickle.load(handle)
            self._nbElementsPerChromosome = pickle.load(handle)
            self._ncLists                 = pickle.load(handle)
            for ncList in self._ncLists.values():
                ncList._reopenFiles()
        finally:
            # Previously the handle leaked when loading or reopening failed.
            handle.close()

    def getSortedFileNames(self):
        return self._sortedFileNames

    def getNbElements(self):
        return self._nbElements

    def getNbElementsPerChromosome(self):
        return self._nbElementsPerChromosome

    def getNCLists(self):
        return self._ncLists
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/plotCoverage.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/plotCoverage.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,481 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import os, os.path, subprocess, glob, random\n+from optparse import OptionParser\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+from SMART.Java.Python.misc.Progress import Progress\n+from commons.core.parsing.FastaParser import FastaParser\n+\n+strands = [-1, 1]\n+colors = {-1: "blue", 1: "red", 0: "black"}\n+colorLine = "black"\n+\n+def parseTargetField(field):\n+\tstrand = "+"\n+\tsplittedFieldSpace = field.split()\n+\tsplittedFieldPlus = field.split("+", 4)\n+\tif len(splittedFieldSpace) == 3:\n+\t\tid, start, end = splittedFieldSpace\n+\telif len(splittedFieldSpace) == 4:\n+\t\tid, start, end, strand = splittedFieldSpace\n+\telif len(splittedFieldPlus) == 3:\n+\t\tid, start, end = splittedFieldPlus\n+\telif len(splittedFieldPlus) == 4:\n+\t\tid, start, end, strand = splittedFieldPlus\n+\telse:\n+\t\traise Exception("Cannot parse Target field \'%s\'." 
% (field))\n+\treturn (id, int(start), int(end), strand)\n+\n+\n+class SimpleTranscript(object):\n+\tdef __init__(self, transcript1, transcript2, color = None):\n+\t\tself.start = max(0, transcript1.getStart() - transcript2.getStart())\n+\t\tself.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart())\n+\t\tself.strand = transcript1.getDirection() * transcript2.getDirection()\n+\t\tself.exons = []\n+\t\tfor exon in transcript1.getExons():\n+\t\t\tif exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd():\n+\t\t\t\tstart = max(0, exon.getStart() - transcript2.getStart())\n+\t\t\t\tend = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart())\n+\t\t\t\tself.addExon(start, end, self.strand, color)\n+\n+\tdef addExon(self, start, end, strand, color):\n+\t\texon = SimpleExon(start, end, strand, color)\n+\t\tself.exons.append(exon)\n+\n+\tdef getRScript(self, yOffset, height):\n+\t\trString = ""\n+\t\tpreviousEnd = None\n+\t\tfor exon in sorted(self.exons, key=lambda exon: exon.start):\n+\t\t\tif previousEnd != None:\n+\t\t\t\trString += "segments(%.1f, %.1f, %.1f, %.1f, col = \\"%s\\")\\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine)\n+\t\t\trString += exon.getRScript(yOffset, height)\n+\t\t\tpreviousEnd = exon.end\n+\t\treturn rString\n+\n+\n+class SimpleExon(object):\n+\tdef __init__(self, start, end, strand, color = None):\n'..b'on="store", type="string", help="input file 1 [compulsory] [format: file in transcript or mapping format given by -f]")\n+\tparser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript or mapping file format]")\n+\tparser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by 
-g]")\n+\tparser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]")\n+\tparser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]")\n+\tparser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")\n+\tparser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]")\n+\tparser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]")\n+\tparser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]")\n+\tparser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]")\n+\tparser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]")\n+\tparser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]")\n+\tparser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]")\n+\tparser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]")\n+\tparser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] 
[default: black]")\n+\tparser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]")\n+\tparser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")\n+\tparser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")\n+\t(options, args) = parser.parse_args()\n+\n+\tcolors[1] = options.plusColor\n+\tcolors[-1] = options.minusColor\n+\tcolors[0] = options.sumColor\n+\tcolorLine = options.lineColor\n+\n+\tpp = PlotParser(options.verbosity)\n+\tpp.addInput(0, options.inputFileName1, options.inputFormat1)\n+\tpp.addInput(1, options.inputFileName2, options.inputFormat2)\n+\tpp.addSequence(options.inputSequence)\n+\tpp.setOutput(options.outputFileName if os.path.isabs(options.outputFileName) else os.path.join(options.working_Dir, options.outputFileName))\n+\tpp.setPlotSize(options.width, options.height)\n+\tpp.setLabels(options.xLabel, options.yLabel)\n+\tpp.setTitle(options.title)\n+\tpp.setMerge(options.merge)\n+\tpp.start()\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/plotRepartition.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/plotRepartition.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Plot the data from the data files
"""
import os
from optparse import OptionParser
from commons.core.parsing.GffParser import GffParser
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress


if __name__ == "__main__":

    # parse command line
    description = "Plot Repartition v1.0.1: Plot the repartition of different data on a whole genome. (This tool uses 1 input file only, the different values being stored in the tags. See documentation to know more about it.) [Category: Visualization]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file name [compulsory] [format: file in GFF3 format]")
    optionParser.add_option("-n", "--names", dest="names", action="store", default=None, type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]")
    optionParser.add_option("-c", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]")
    optionParser.add_option("-f", "--format", dest="format", action="store", default="png", type="string", help="format of the output file [format: string] [default: png]")
    optionParser.add_option("-r", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]")
    optionParser.add_option("-l", "--log", dest="log", action="store", default="", type="string", help="use log on x- or y-axis (write 'x', 'y' or 'xy') [format: string]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    optionParser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
    (options, args) = optionParser.parse_args()

    strands = [1, -1]
    strandToString = {1: "+", -1: "-"}
    # One "condition" per tag name; without -n a single anonymous condition
    # counts every transcript once.
    names = [None] if options.names is None else options.names.split(",")
    if options.colors is None:
        colors = [None] * len(names)
    else:
        colors = options.colors.split(",")
        # colors[i] is read for every name when plotting: fail now rather than
        # with an IndexError after the whole input has been read.
        if len(colors) < len(names):
            optionParser.error("%d color(s) given with -c, but %d name(s) given with -n" % (len(colors), len(names)))

    maxs = {}                          # chromosome -> greatest start seen
    nbElements = [0] * len(names)      # per condition: sum of the counted values
    lines = [{} for name in names]     # per condition: chromosome -> strand -> start -> value

    gffParser = GffParser(options.inputFileName, options.verbosity)
    progress = Progress(gffParser.getNbTranscripts(), "Reading %s" % (options.inputFileName), options.verbosity)
    for transcript in gffParser.getIterator():
        chromosome = transcript.getChromosome()
        direction = transcript.getDirection()
        start = transcript.getStart()
        # Independent of the condition, so computed once per transcript.
        maxs[chromosome] = max(maxs.get(chromosome, start), start)
        for i, name in enumerate(names):
            if chromosome not in lines[i]:
                lines[i][chromosome] = dict([(strand, {}) for strand in strands])
            thisNbElements = float(transcript.getTagValue(name)) if name is not None and name in transcript.getTagNames() else 1
            line = lines[i][chromosome][direction]
            # Minus-strand values are stored negated (direction is -1), which
            # draws them below the x-axis.
            line[start] = line.get(start, 0) + thisNbElements * direction
            nbElements[i] += thisNbElements
        progress.inc()
    progress.done()

    if options.normalize:
        if options.verbosity >= 10:
            print("Normalizing...")
        maxNbElements = max(nbElements)
        for i, linesPerCondition in enumerate(lines):
            # A condition whose values sum to zero cannot be rescaled; leaving
            # it untouched avoids a ZeroDivisionError.
            if nbElements[i] == 0:
                continue
            for linesPerChromosome in linesPerCondition.values():
                for line in linesPerChromosome.values():
                    for key in line:
                        line[key] = line[key] / float(nbElements[i]) * maxNbElements
        if options.verbosity >= 10:
            print("... done.")

    progress = Progress(len(maxs), "Plotting", options.verbosity)
    for chromosome in maxs:
        plot = RPlotter("%s%s.%s" % (options.outputFileName, chromosome.capitalize(), options.format), options.verbosity)
        plot.setLog(options.log)
        plot.setImageSize(2000, 500)
        plot.setFormat(options.format)
        # Pick the x-axis unit from the greatest start seen on this chromosome.
        if maxs[chromosome] <= 1000:
            unit = "nt."
            ratio = 1.0
        elif maxs[chromosome] <= 1000000:
            unit = "kb"
            ratio = 1000.0
        else:
            unit = "Mb"
            ratio = 1000000.0
        plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
        plot.setYLabel("# reads")
        plot.setLegend(True)
        for i, name in enumerate(names):
            for strand in strands:
                correctedLine = dict([(key / ratio, value) for key, value in lines[i][chromosome][strand].items()])
                # Build the label in its own variable: rebinding `name` here
                # would make the second strand's label "x (+) (-)".
                label = None if name is None else "%s (%s)" % (name.replace("_", " "), strandToString[strand])
                # NOTE(review): assumes the second positional parameter of
                # RPlotter.addLine is the legend name -- confirm against
                # RPlotter. Without -n the label is None, as it was before.
                plot.addLine(correctedLine, label, colors[i])
        plot.plot()
        progress.inc()
    progress.done()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/plotTranscriptList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/plotTranscriptList.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,255 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+"""\n+Plot the data from the data files\n+"""\n+import sys\n+import math\n+from optparse import OptionParser\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from SMART.Java.Python.misc.RPlotter import RPlotter\n+\n+\n+class PlotTranscriptList(object):\n+\n+ def __init__(self, verbosity = 0):\n+ self.inputFileName = None\n+ self.format = None\n+ self.x = None\n+ self.y = None\n+ self.z = None\n+ self.xDefault = None\n+ self.yDefault = None\n+ self.zDefault = None\n+ self.xLabel = None\n+ self.yLabel = None\n+ self.shape = None\n+ self.bucket = None\n+ self.keep = None\n+ self.log = None\n+ self.verbosity = None\n+\n+\n+ def setPlotter(self, outputFileName, keep, log, xLabel, yLabel):\n+ self.plotter = RPlotter(outputFileName, self.verbosity, keep)\n+ if self.shape != "barplot":\n+ self.plotter.setLog(log)\n+ self.plotter.setXLabel(xLabel)\n+ self.plotter.setYLabel(yLabel)\n+\n+\n+ def setShape(self, shape):\n+ if self.shape == "line":\n+ pass\n+ elif shape == "barplot":\n+ self.plotter.setBarplot(True)\n+ elif shape == "points":\n+ self.plotter.setPoints(True)\n+ elif shape == "heatPoints":\n+ self.plotter.setHeatPoints(True)\n+ else:\n+ sys.exit("Do not understand shape \'%s\'" % (shape))\n+\n+\n+ def setInput(self, inputFileName, format):\n+ self.parser = TranscriptContainer(inputFileName, format, self.verbosity)\n+\n+\n+ def getValues(self, transcript):\n+ x, y, z = None, None, None\n+ x = transcript.getTagValue(self.x)\n+ if self.y != None:\n+ y = transcript.getTagValue(self.y)\n+ if self.z != 
None:\n+ z = transcript.getTagValue(self.z)\n+ if x == None:\n+ if self.xDefault != None:\n+ x = self.xDefault\n+ else:\n+ sys.exit("Error! Transcript %s do not have the x-tag %s" % (transcript, self.x))\n+ if y == None and self.shape != "line" and self.shape != "barplot":\n+ if self.yDefault != None:\n+ y = s'..b'line = self.clusterInBarplot(line)\n+\n+ if self.shape == "points" or self.shape == "barplot" or self.shape == "line":\n+ self.plotter.addLine(line)\n+ elif self.shape == "heatPoints":\n+ self.plotter.addLine(line)\n+ self.plotter.addHeatLine(heatLine)\n+ else:\n+ sys.exit("Do not understand shape \'%s\'" % (self.shape))\n+\n+ self.plotter.plot()\n+\n+ if self.shape == "points" or self.shape == "heatPoints":\n+ self.getSpearmanRho()\n+\n+\n+\n+if __name__ == "__main__":\n+ \n+ # parse command line\n+ description = "Plot v1.0.2: Plot some information from a list of transcripts. [Category: Visualization]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input",dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")\n+ parser.add_option("-f", "--format",dest="format", action="store",type="string", help="format of the input [compulsory] [format: transcript file format]")\n+ parser.add_option("-x", "--x",dest="x",action="store", type="string", help="tag for the x value [format: string]")\n+ parser.add_option("-y", "--y",dest="y",action="store", type="string", help="tag for the y value [format: string]")\n+ parser.add_option("-z", "--z",dest="z", action="store", default=None,type="string", help="tag for the z value [format: string]")\n+ parser.add_option("-X", "--xDefault",dest="xDefault",action="store", default=None,type="float",help="value for x when tag is not present [format: float]")\n+ parser.add_option("-Y", "--yDefault",dest="yDefault",action="store",default=None,type="float",help="value for y when tag is not present [format: float]")\n+ 
parser.add_option("-Z", "--zDefault",dest="zDefault", action="store",default=None,type="float",help="value for z when tag is not present [format: float]")\n+ parser.add_option("-n", "--xLabel",dest="xLabel",action="store",default="",type="string", help="label on the x-axis [format: string] [default: ]")\n+ parser.add_option("-m", "--yLabel",dest="yLabel",action="store",default="", type="string", help="label on the y-axis [format: string] [default: ]")\n+ parser.add_option("-o", "--output",dest="outputFileName",action="store",type="string", help="output file names [format: output file in PNG format]")\n+ parser.add_option("-s", "--shape",dest="shape",action="store", type="string", help="shape of the plot [format: choice (barplot, line, points, heatPoints)]")\n+ parser.add_option("-b", "--bucket",dest="bucket",action="store",default=None,type="float",help="bucket size (for the line plot) [format: int] [default: 1]")\n+ parser.add_option("-k", "--keep",dest="keep",action="store_true", default=False, help="keep temporary files [format: bool]")\n+ parser.add_option("-l", "--log",dest="log",action="store",default="",type="string", help="use log on x- or y-axis (write \'x\', \'y\' or \'xy\') [format: string] [default: ]")\n+ parser.add_option("-v", "--verbosity",dest="verbosity",action="store",default=1, type="int",help="trace level [format: int]")\n+ (options, args) = parser.parse_args()\n+\n+ plotTranscriptList = PlotTranscriptList(options.verbosity)\n+ plotTranscriptList.x = options.x\n+ plotTranscriptList.y = options.y\n+ plotTranscriptList.z = options.z\n+ plotTranscriptList.xDefault = options.xDefault\n+ plotTranscriptList.yDefault = options.yDefault\n+ plotTranscriptList.zDefault = options.zDefault\n+ plotTranscriptList.shape = options.shape\n+ plotTranscriptList.bucket = options.bucket\n+ plotTranscriptList.log = options.log\n+ plotTranscriptList.setPlotter(options.outputFileName, options.keep, options.log, options.xLabel, options.yLabel)\n+ 
plotTranscriptList.setShape(options.shape)\n+ plotTranscriptList.setInput(options.inputFileName, options.format)\n+ plotTranscriptList.run()\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/removeExonLines.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/removeExonLines.sh Tue Apr 30 15:02:29 2013 -0400 |
b |
#!/bin/bash
# Print the file given as first argument on standard output, dropping every
# line that contains the string "exon". With no argument (or an empty one),
# standard input is filtered instead.
#
# "$1" is quoted so that paths containing spaces or glob characters reach sed
# as one argument; the ${1:+...} form expands to nothing when $1 is unset or
# empty, so sed still falls back to standard input in that case.
sed '/exon/d' ${1:+"$1"}
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/restrictFromSize.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/restrictFromSize.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,94 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""Select the sequences or transcripts of a file whose size lies within given bounds"""

import os
from optparse import OptionParser
from commons.core.parsing.FastaParser import *
from commons.core.parsing.FastqParser import *
from SMART.Java.Python.structure.TranscriptContainer import *
from commons.core.writer.TranscriptWriter import *
from commons.core.writer.FastaWriter import *
from commons.core.writer.FastqWriter import *
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.misc.RPlotter import *


if __name__ == "__main__":

    # parse command line
    description = "Restrict from Size v1.0.1: Select the elements of a list of sequences or transcripts with a given size. [Category: Data Selection]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript or sequence format given by -f]")
    optionParser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: sequence or transcript file format]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript or sequence format given by -f]")
    optionParser.add_option("-m", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size [format: int]")
    optionParser.add_option("-M", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size [format: int]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    optionParser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    (options, args) = optionParser.parse_args()

    # Sequence formats have dedicated parsers/writers; anything else is
    # handled as a transcript format.
    isSequenceFormat = options.format in ("fasta", "fastq")
    if options.format == "fasta":
        parser = FastaParser(options.inputFileName, options.verbosity)
        writer = FastaWriter(options.outputFileName, options.verbosity)
    elif options.format == "fastq":
        parser = FastqParser(options.inputFileName, options.verbosity)
        writer = FastqWriter(options.outputFileName, options.verbosity)
    else:
        parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
        writer = TranscriptWriter(options.outputFileName, options.format, options.verbosity)

    # treat items
    nbItems = parser.getNbItems()
    progress = Progress(nbItems, "Analyzing sequences of %s" % (options.inputFileName), options.verbosity)
    nbKept = 0      # elements kept; a record counts for its "nbElements" tag when it has one
    nbRead = 0      # elements read, weighted the same way
    nbClKept = 0    # records kept
    nbClRead = 0    # records read
    for item in parser.getIterator():
        size = item.getSize()
        nb = 1 if isSequenceFormat or "nbElements" not in item.getTagNames() else float(item.getTagValue("nbElements"))
        nbRead += nb
        nbClRead += 1
        # A bound left to None is not enforced; both bounds are inclusive.
        if (options.minSize is None or options.minSize <= size) and (options.maxSize is None or options.maxSize >= size):
            writer.addElement(item)
            nbKept += nb
            nbClKept += 1
        progress.inc()
    progress.done()

    writer.write()

    # nbKept is weighted by "nbElements", so the ratio must use the equally
    # weighted nbRead (not the record count nbItems), otherwise the percentage
    # can exceed 100 on clustered input.
    print("%d items, %d kept (%.2f%%)" % (nbRead, nbKept, 0 if nbRead == 0 else float(nbKept) / nbRead * 100))
    if nbKept != nbClKept or nbRead != nbClRead:
        print("%d clusters, %d kept (%.2f%%)" % (nbClRead, nbClKept, 0 if nbClRead == 0 else float(nbClKept) / nbClRead * 100))
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/restrictSequenceList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/restrictSequenceList.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,113 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""Restrict a sequence list with some names"""

from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.WriterChooser import WriterChooser
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc import Utils

class RestrictSequenceList(object):
    """Copy the sequences of a file whose name is (or, in exclusion mode, is
    not) listed in a names file."""

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.exclude = False
        self.names = set()          # names read from the names file
        self.missingNames = set()   # listed names not (yet) met in the input

    def setInputFileName(self, fileName, format):
        """Open `fileName` with the parser that ParserChooser returns for `format`."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.parser = chooser.getParser(fileName)

    def setExclusion(self, boolean):
        """When `boolean` is true, output the sequences that are NOT listed."""
        self.exclude = boolean

    def setOutputFileName(self, fileName, format):
        """Open `fileName` with the writer that WriterChooser returns for `format`."""
        chooser = WriterChooser(self.verbosity)
        chooser.findFormat(format)
        self.writer = chooser.getWriter(fileName)

    def setNamesFileName(self, fileName):
        """Set the text file that holds one sequence name per line."""
        self.namesFileName = fileName

    def _readNames(self):
        """Load the names file into `self.names`, ignoring blank lines."""
        # `with` closes the file even if reading fails.
        with open(self.namesFileName) as handle:
            stripped = (line.strip() for line in handle)
            self.names = set(name for name in stripped if name)
        self.missingNames = set(self.names)

    def _write(self):
        """Send every selected sequence to the writer and print the counts."""
        nbElements = self.parser.getNbItems()
        progress = Progress(nbElements, "Parsing input file", self.verbosity)
        nbRead = 0
        nbWritten = 0
        for element in self.parser.getIterator():
            name = element.getName()
            nbRead += 1
            # The lookup set is never modified while reading, so records
            # sharing a name are all treated alike. Removing a name after its
            # first hit would let a second record with an excluded name
            # through in exclusion mode.
            isListed = name in self.names
            if Utils.xor(isListed, self.exclude):
                self.writer.addElement(element)
                nbWritten += 1
            if isListed:
                self.missingNames.discard(name)
            progress.inc()
        progress.done()
        # NOTE(review): the writer is never explicitly written or closed here;
        # presumably addElement writes through -- confirm against the writers.
        if self.verbosity > 0:
            print("%d read" % (nbRead))
            print("%d written (%d%%)" % (nbWritten, 0 if nbRead == 0 else round(float(nbWritten) / nbRead * 100)))

    def run(self):
        """Read the names, filter the input, and report the names never met."""
        self._readNames()
        self._write()
        if self.missingNames:
            print("Some names are not present in the file: %s" % ", ".join(sorted(self.missingNames)))



if __name__ == "__main__":

    description = "Restrict Sequence List v1.0.1: Keep the elements of a list of sequences whose name is mentionned in a given file. [Category: Data Selection]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFile", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", default="fasta", type="string", help="format of the input and output files [compulsory] [format: sequence file format] [default: fasta]")
    parser.add_option("-n", "--name", dest="names", action="store", type="string", help="names of the transcripts [compulsory] [format: file in TXT format]")
    parser.add_option("-o", "--output", dest="outputFile", action="store", type="string", help="output file [format: output file in sequence format given by -f]")
    parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="output all those whose name is NOT on the list [format: boolean]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    rsl = RestrictSequenceList(options.verbosity)
    rsl.setInputFileName(options.inputFile, options.format)
    rsl.setOutputFileName(options.outputFile, options.format)
    rsl.setNamesFileName(options.names)
    rsl.setExclusion(options.exclude)
    rsl.run()
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/restrictTranscriptList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/restrictTranscriptList.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,85 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""Restrict a transcript list with some parameters (regions)"""

from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.misc.Progress import Progress

# Maps the value of -t to the direction compared with transcript.getDirection();
# None means "do not filter on the strand".
STRAND2DIRECTION = {"+": 1, "-": -1, None: None}

if __name__ == "__main__":

    # parse command line
    description = "Restrict Transcript List v1.0.2: Keep the coordinates which are located in a given position. [Category: Data Selection]"

    optionParser = OptionParser(description = description)
    optionParser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    optionParser.add_option("-f", "--format", dest="format", action="store", type="string", help="format [compulsory] [format: transcript file format]")
    optionParser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="chromosome [format: string]")
    optionParser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start [format: int]")
    optionParser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end [format: int]")
    optionParser.add_option("-t", "--strand", dest="strand", action="store", default=None, type="string", help="strand (+ or -) [format: string]")
    optionParser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    optionParser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = optionParser.parse_args()

    # Reject an unknown strand with a usage message instead of a bare KeyError.
    if options.strand not in STRAND2DIRECTION:
        optionParser.error("strand (-t) should be '+' or '-', not '%s'" % (options.strand))
    direction = STRAND2DIRECTION[options.strand]

    parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
    writer = TranscriptWriter(options.outputFileName, options.format, options.verbosity)

    nbTranscripts = parser.getNbTranscripts()
    progress = Progress(nbTranscripts, "Parsing file %s" % (options.inputFileName), options.verbosity)

    nbTotal = 0
    nbKept = 0
    for transcript in parser.getIterator():
        progress.inc()
        nbTotal += 1
        # Each criterion left to None is not enforced.
        if options.chromosome is not None and options.chromosome != transcript.getChromosome():
            continue
        # Keep the transcripts overlapping [start, end]: drop those ending
        # before the start bound or starting after the end bound.
        if options.start is not None and options.start > transcript.getEnd():
            continue
        if options.end is not None and options.end < transcript.getStart():
            continue
        if direction is not None and direction != transcript.getDirection():
            continue
        nbKept += 1
        writer.addTranscript(transcript)
    progress.done()

    writer.write()

    # An empty input would otherwise raise ZeroDivisionError after the output
    # has been written.
    percentKept = 0.0 if nbTotal == 0 else float(nbKept) / nbTotal * 100
    print("%d out of %d are kept (%f%%)" % (nbKept, nbTotal, percentKept))
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/Bins.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/Bins.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,77 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
"""
Some functions about bins

A bin number encodes a level i (bins of width 10**i, with
getMinBin() <= i <= getMaxBin()) and the index of the bin inside that level:
    bin = i * 10 ** (getMaxBin() + 1) + index
One extra "big bin", (getMaxBin() + 1) * 10 ** (getMaxBin() + 1), holds
everything that does not fit into a single bin of any level.

Bin indices are computed with floor division (//) so that the results do not
depend on whether "/" is the Python 2 integer division or true division.
"""

def getMinBin():
    """
    Get the smallest bin level (bins of width 10 ** 3)
    @return: the smallest exponent
    """
    return 3


def getMaxBin():
    """
    Get the largest bin level (bins of width 10 ** 7)
    @return: the largest exponent
    """
    return 7


def getBin(start, end):
    """
    Get the smallest bin that contains both positions
    @param start: start position
    @type  start: int
    @param end: end position
    @type  end: int
    @return: the bin number, or the big bin if no level contains both positions
    """
    levelOffset = 10 ** (getMaxBin() + 1)
    for i in range(getMinBin(), getMaxBin() + 1):
        binLevel = 10 ** i
        startBin = int(start // binLevel)
        if startBin == int(end // binLevel):
            return int(i * levelOffset + startBin)
    return int((getMaxBin() + 1) * levelOffset)


def getOverlappingBins(start, end):
    """
    Get, for every level, the range of bins that may overlap the given region
    @param start: start position
    @type  start: int
    @param end: end position
    @type  end: int
    @return: a list of (first bin, last bin) pairs, one per level, followed by
             the (big bin, big bin) pair
    """
    levelOffset = 10 ** (getMaxBin() + 1)
    bigBin = int((getMaxBin() + 1) * levelOffset)
    array = []
    for i in range(getMinBin(), getMaxBin() + 1):
        binLevel = 10 ** i
        firstBin = int(i * levelOffset + int(start // binLevel))
        lastBin = int(i * levelOffset + int(end // binLevel))
        array.append((firstBin, lastBin))
    array.append((bigBin, bigBin))
    return array


def getIterator(maxValue = None):
    """
    Iterate over all the bin numbers, level by level, ending with the big bin
    @param maxValue: the largest position to consider
                     [default: 10 ** (getMaxBin() + getMinBin()) - 1]
    @type  maxValue: int
    """
    if maxValue is None:
        maxValue = 10 ** (getMaxBin() + getMinBin()) - 1
    for i in range(getMinBin(), getMaxBin() + 1):
        binLevel = 10 ** i
        binBit = i * 10 ** (getMaxBin() + 1)
        # range() needs an int, whatever the division semantics in force
        for j in range(0, int(maxValue // binLevel) + 1):
            yield binBit + j
    yield int((getMaxBin() + 1) * 10 ** (getMaxBin() + 1))


def getNbBins(maxValue = None):
    """
    Get the number of bins
    @param maxValue: the largest position to consider
                     [default: 10 ** (getMaxBin() + getMinBin()) - 1]
    @type  maxValue: int
    @return: the number of bins
    """
    # NOTE(review): getIterator() yields maxValue // 10 ** i + 1 bins per
    # level whereas only maxValue // 10 ** i are counted here; the historical
    # count is kept unchanged -- confirm which one callers expect.
    if maxValue is None:
        maxValue = 10 ** (getMaxBin() + getMinBin()) - 1
    nbBins = 0
    for i in range(getMinBin(), getMaxBin() + 1):
        nbBins += int(maxValue // 10 ** i)
    return nbBins + 1
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/Interval.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/Interval.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,706 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+\n+from SMART.Java.Python.structure.Bins import *\n+from commons.core.coord.Range import Range\n+\n+class Interval(Range):\n+ """\n+ Store a genomic interval\n+ @ivar name: name of the interval [optional]\n+ @type name: string\n+ @ivar id: id of the interval [optional]\n+ @type id: int\n+ @ivar bin: bin in which the interval should be if stored in a database [computed]\n+ @type bin: int \n+ @ival tags: information about the transcript [optional]\n+ @type tags: dict\n+ @ivar verbosity: verbosity\n+ @type verbosity: int [default: 0]\n+ """\n+\n+ def __init__(self, interval = None, verbosity = 0):\n+ """\n+ Constructor\n+ @param interval: interval to be copied\n+ @type interval: class L{Interval<Interval>}\n+ @param verbosity: verbosity\n+ @type verbosity: int\n+ """\n+ Range.__init__(self)\n+ self.name = None\n+ self.id = None\n+ self.bin = None\n+ self.verbosity = verbosity\n+ self.tags = {}\n+ if interval != None:\n+ self.copy(interval)\n+\n+ #!!!! 
Warning: two methods getStart() and getEnd() give the information maximum and minimum in interval.!!!!#\n+ #In case strand = "+", start < end; strand = "-", start > end \n+ def getStart(self):\n+ if self.start == -1:\n+ return -1\n+ if self.end == -1:\n+ return self.start\n+ return self.getMin()\n+\n+ \n+ def getEnd(self):\n+ if self.end == -1:\n+ return -1\n+ if self.start == -1:\n+ return self.end\n+ return self.getMax()\n+\n+\n+ def getChromosome(self):\n+ return self.getSeqname()\n+\n+\n+ def getDirection(self):\n+ return 1 if self.getStrand() == "+" else -1\n+\n+\n+ def getName(self):\n+ return self.name\n+\n+\n+ def isSet(self):\n+ """\n+ Check if the interval is set\n+ """\n+ return self.getStart() == None and self.getEnd() == None\n+\n+\n+ def copy(self, interval):\n+ """\n+ Copy method\n+ @param interval: interval to be copied\n+ @type interval: class L{Interval<Interval>}\n+ """\n+ self.setStart(interval.getStart())\n+ self.setEnd(interval.getEnd())\n+ self.setChromosome(interval.getChromosome())\n+ self.setDirection(interval.getDirection()'..b'+ variables = ["name", "chromosome", "start", "end", "direction", "tags", "bin"]\n+ return variables\n+ getSqlVariables = classmethod(getSqlVariables)\n+\n+\n+ def setSqlValues(self, array):\n+ """\n+ Set the values of the properties of this object as given by a results line of a SQL query\n+ """\n+ self.id = array[0]\n+ self.name = array[1].strip("\'")\n+ self.setChromosome(array[2].strip("\'"))\n+ self.setStart(array[3])\n+ self.setEnd(array[4])\n+ self.setDirection(array[5])\n+ self.setTagValues(array[6].strip("\'"), ";", "=")\n+ self.bin = array[7]\n+\n+\n+ def getSqlValues(self):\n+ """\n+ Get the values of the properties that should be saved in a database\n+ """\n+ values = dict()\n+ values["name"] = self.name\n+ values["chromosome"] = self.getChromosome()\n+ values["start"] = self.getStart()\n+ values["end"] = self.getEnd()\n+ values["direction"] = self.getDirection()\n+ values["tags"] = 
self.getTagValues(";", "=")\n+ values["bin"] = self.getBin()\n+ return values\n+\n+\n+ def getSqlTypes(cls):\n+ """\n+ Get the values of the properties that should be saved in a database\n+ """\n+ types = dict()\n+ types["name"] = "varchar"\n+ types["chromosome"] = "varchar"\n+ types["start"] = "int"\n+ types["end"] = "int"\n+ types["direction"] = "tinyint"\n+ types["tags"] = "varchar"\n+ types["bin"] = "int"\n+ return types\n+ getSqlTypes = classmethod(getSqlTypes)\n+ \n+\n+ def getSqlSizes(cls):\n+ """\n+ Get the sizes of the properties that should be saved in a database\n+ """\n+ sizes = dict()\n+ sizes["name"] = 255\n+ sizes["chromosome"] = 255\n+ sizes["start"] = 11\n+ sizes["end"] = 11\n+ sizes["direction"] = 4\n+ sizes["tags"] = 1023\n+ sizes["bin"] = 11\n+ return sizes\n+ getSqlSizes = classmethod(getSqlSizes)\n+ \n+\n+ def printCoordinates(self):\n+ """\n+ Print start and end positions (depending on the direction of the interval)\n+ """\n+ if self.getDirection() == 1:\n+ return "%d-%d" % (self.getStart(), self.getEnd())\n+ else:\n+ return "%d-%d" % (self.getEnd(), self.getStart())\n+\n+ \n+ def extractSequence(self, parser):\n+ """\n+ Get the sequence corresponding to this interval\n+ @param parser: a parser to a FASTA file\n+ @type parser: class L{SequenceListParser<SequenceListParser>}\n+ @return : a instance of L{Sequence<Sequence>}\n+ """\n+ return parser.getSubSequence(self.getChromosome(), self.getStart(), self.getEnd(), self.getDirection(), self.name)\n+ \n+ \n+ def extractWigData(self, parser):\n+ """\n+ Get the data retrieved from a wig file\n+ @param parser: a parser class to a WIG file\n+ @type parser: class L{WigParser<WigParser>}\n+ """\n+ data = parser.getRange(self.getChromosome(), self.getStart(), self.getEnd())\n+ if self.getDirection() == -1:\n+ if parser.strands:\n+ newData = {}\n+ for strand in data:\n+ data[strand].reverse()\n+ newData[-strand] = data[strand]\n+ data = newData\n+ else:\n+ data.reverse()\n+ return data\n+\n+\n+ def 
__str__(self):\n+ """\n+ Output a simple representation of this interval\n+ """\n+ direction = "+"\n+ if self.getDirection() == -1:\n+ direction = "-"\n+ string = "%s:%d-%d (%s)" % (self.getChromosome(), self.getStart(), self.getEnd(), direction)\n+ if self.name != "":\n+ string = "(%s) %s" % (self.name, string)\n+ return string\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/Mapping.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/Mapping.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,255 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+from SMART.Java.Python.structure.SubMapping import SubMapping\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.Interval import Interval\n+\n+class Mapping(object):\n+ """A class that represents a mapping"""\n+\n+ def __init__(self):\n+ self.targetInterval = None\n+ self.queryInterval = None\n+ self.subMappings = []\n+ self.size = None\n+ self.transcript = None\n+ self.tags = {}\n+\n+\n+ def copy(self, mapping):\n+ for subMapping in mapping.subMappings:\n+ newSubMapping = SubMapping(subMapping)\n+ self.addSubMapping(newSubMapping)\n+ self.targetInterval = Interval(mapping.targetInterval)\n+ self.queryInterval = Interval(mapping.queryInterval)\n+ self.size = mapping.size\n+ self.tags = {}\n+ for tag in mapping.tags:\n+ self.tags[tag] = mapping[tag]\n+ self.transcript.copy(mapping.transcript)\n+\n+\n+ def setTargetInterval(self, interval):\n+ self.targetInterval = Interval(interval)\n+ if self.queryInterval != None:\n+ self.setDirection(self.targetInterval.getDirection() * self.queryInterval.getDirection())\n+\n+\n+ def setQueryInterval(self, interval):\n+ self.queryInterval = Interval(interval)\n+ if self.targetInterval != None:\n+ self.setDirection(self.targetInterval.getDirection() * self.queryInterval.getDirection())\n+\n+\n+ def getQueryInterval(self):\n+ return self.queryInterval\n+\n+\n+ def addSubMapping(self, subMapping):\n+ subMappingCopy = SubMapping(subMapping)\n+ self.subMappings.append(subMappingCopy)\n+\n+ if self.targetInterval:\n+ 
self.targetInterval.setStart(min(self.targetInterval.getStart(), subMapping.targetInterval.getStart()))\n+ self.targetInterval.setEnd(max(self.targetInterval.getEnd(), subMapping.targetInterval.getEnd()))\n+ else:\n+ self.setTargetInterval(subMapping.targetInterval)\n+ if self.queryInterval:\n+ self.queryInterval.setStart(min(self.queryInterval.getStart(), subMapping.queryInterval.getStart()))\n+ self.queryInterval.setEnd(max(self.queryInterval.getEnd(), subMapping.queryInterval.getEnd()))\n+ else:\n+ self.setQueryInterval(subMapping.queryInterval)\n+\n+ if self.getDirection() != 0:\n+ subMapping.setDirection(self.getDirection'..b'ccurrences(self, nbOccurrences):\n+ self.setTagValue("nbOccurrences", nbOccurrences)\n+\n+\n+ def setNbMismatches(self, nbMismatches):\n+ self.setTagValue("nbMismatches", nbMismatches)\n+ if self.size != None and "identity" not in self.getTagNames():\n+ identity = 100 if self.size == 0 else (self.size - self.getTagValue("nbMismatches")) / float(self.size) * 100\n+ self.setTagValue("identity", identity)\n+\n+\n+ def setNbGaps(self, nbGaps):\n+ self.setTagValue("nbGaps", nbGaps)\n+ \n+ \n+ def setRank(self, rank):\n+ self.setTagValue("rank", rank)\n+ \n+\n+ def setEvalue(self, evalue):\n+ self.setTagValue("evalue", evalue)\n+ \n+\n+ def setOccurrence(self, occurrence):\n+ self.setTagValue("occurrence", occurrence)\n+ \n+ \n+ def setBestRegion(self, bestRegion):\n+ self.setTagValue("bestRegion", bestRegion)\n+\n+\n+ def mergeExons(self, distance):\n+ previousSubMapping = None\n+ subMappings = []\n+ for subMapping in self.subMappings:\n+ if previousSubMapping == None:\n+ subMappings.append(subMapping)\n+ previousSubMapping = subMapping\n+ else:\n+ targetDistance = subMapping.targetInterval.getDistance(previousSubMapping.targetInterval)\n+ queryDistance = subMapping.queryInterval.getDistance(previousSubMapping.queryInterval)\n+ if targetDistance <= distance:\n+ self.setTagValue("nbGaps", self.getTagValue("nbGaps") + queryDistance)\n+ 
previousSubMapping.merge(subMapping)\n+ else:\n+ subMappings.append(subMapping)\n+ previousSubMapping = subMapping\n+ self.subMappings = subMappings\n+ \n+ \n+ def getTranscript(self):\n+ """\n+ Extract a transcript from this mapping\n+ @return: a transcript\n+ """\n+ if self.transcript != None:\n+ return self.transcript\n+ self.transcript = Transcript()\n+ self.transcript.copy(self.targetInterval)\n+ self.transcript.setDirection(self.getDirection())\n+ self.transcript.setName(self.queryInterval.getName())\n+ self.transcript.removeExons()\n+ if len(self.subMappings) > 1:\n+ for subMapping in self.subMappings:\n+ self.transcript.addExon(subMapping.targetInterval)\n+ cpt = 1\n+ for exon in self.transcript.exons:\n+ exon.setDirection(self.transcript.getDirection())\n+ exon.setName("%s-exon%d" % (self.transcript.getName(), cpt))\n+ exon.setChromosome(self.transcript.getChromosome())\n+ cpt += 1\n+ self.transcript.setDirection(self.getDirection())\n+ self.transcript.sortExons()\n+ for tag in self.tags:\n+ if "bestRegion" not in self.getTagNames():\n+ self.transcript.setTagValue("bestRegion", "(self)")\n+ self.transcript.setTagValue(tag, self.getTagValue(tag))\n+ return self.transcript\n+ \n+\n+ def getChromosome(self):\n+ if not self.subMappings:\n+ raise Exception("Error! Mapping \'%s\' has no submapping" % (self))\n+ return self.subMappings[0].targetInterval.getChromosome()\n+\n+\n+ \n+ def getErrorScore(self):\n+ return self.getTagValue("nbGaps") * 3 + self.getTagValue("nbMismatches") + (len(self.subMappings) - 1) * 0.1\n+ \n+\n+ def printGBrowseReference(self):\n+ return self.getTranscript().printGBrowseReference()\n+\n+\n+ def printGBrowseLine(self):\n+ return self.getTranscript().printGBrowseLine()\n+\n+\n+ def printGBrowse(self):\n+ return self.getTranscript().printGBrowse()\n+\n+\n+ def printBed(self):\n+ return self.getTranscript().printBed()\n+\n+\n+ def __str__(self):\n+ return "%s ---- %s" % (str(self.getTranscript()), ", ". 
join([str(submapping) for submapping in self.subMappings]))\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/Sequence.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/Sequence.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,184 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
import re
from commons.core.seq.Bioseq import Bioseq

# Complement of every supported nucleotide code (IUPAC codes, both cases);
# "U" is complemented to "A" like "T".
reverseComplementString = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "U": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "N": "N",
    "a": "t",
    "c": "g",
    "g": "c",
    "t": "a",
    "u": "a",
    "m": "k",
    "r": "y",
    "w": "w",
    "s": "s",
    "y": "r",
    "k": "m",
    "v": "b",
    "h": "d",
    "d": "h",
    "b": "v",
    "n": "n"
}

class Sequence(Bioseq):
    """
    A class that codes for a sequence
    @ivar name: name of the sequence (mirrors the header of the underlying Bioseq)
    @ivar quality: quality values, one item per nucleotide, or None
    @type quality: list of string
    @ivar chunkedSequence: the sequence cut into lines of 60 characters [computed]
    @ivar chunkedQuality: the quality cut into pieces of 60 items [computed]
    @ivar integerQuality: whether the quality was given as space-separated values
    @type integerQuality: boolean
    """

    def __init__(self, name = "", sequence = ""):
        """
        Constructor
        @param name: name of the sequence
        @type  name: string
        @param sequence: the nucleotides
        @type  sequence: string
        """
        super(Sequence, self).__init__(name, sequence)
        self.name = self.header
        self.quality = None
        self.chunkedSequence = None
        self.chunkedQuality = None
        self.integerQuality = False

    def setName(self, name=""):
        """
        Set the name of the sequence
        @param name: name of the sequence
        @type  name: string
        """
        super(Sequence, self).setHeader(name)
        # keep the public attribute in sync with the header
        self.name = self.getHeader()

    def getName(self):
        """
        Get the name of the sequence
        @return: the header of the sequence
        """
        return self.getHeader()

    def setSequence(self, seq=""):
        """
        Set the nucleotides
        @param seq: the nucleotides
        @type  seq: string
        """
        super(Sequence, self).setSequence(seq)

    def setQuality(self, quality):
        """
        Set the quality
        A string containing spaces is read as space-separated values,
        any other string as one character per nucleotide
        @param quality: the quality, or None
        @type  quality: string
        """
        if quality is None:
            self.quality = None
            return
        # recomputed on every call, so that a former format is not remembered
        self.integerQuality = " " in quality
        if self.integerQuality:
            self.quality = quality.split()
        else:
            self.quality = list(quality)

    def getQuality(self):
        """
        Get the quality, in the format it was given
        @return: a string, or None if no quality is set
        """
        if self.quality is None:
            return None
        if self.integerQuality:
            return " ".join(self.quality)
        return "".join(self.quality)

    def getSize(self):
        """
        Get the number of nucleotides
        @return: the size of the sequence
        """
        return len(self.getSequence())


    def copy(self, sequence):
        """
        Copy method
        @param sequence: sequence to be copied
        @type  sequence: class L{Sequence<Sequence>}
        """
        self.setName(sequence.getName())
        self.setSequence(sequence.getSequence())
        self.setQuality(sequence.getQuality())
        self.chunkedSequence = None
        self.chunkedQuality = None


    def chunkSequence(self):
        """
        Cut the sequence (and the quality, if any) into pieces of 60 items
        """
        chunkSize = 60
        sequence = self.getSequence()
        # stepping through the offsets never produces a trailing empty chunk,
        # even when the size is a multiple of the chunk size
        offsets = range(0, self.getSize(), chunkSize)
        self.chunkedSequence = [sequence[offset : offset + chunkSize] for offset in offsets]
        if self.quality is not None:
            self.chunkedQuality = [self.quality[offset : offset + chunkSize] for offset in offsets]

    def concatenate(self, seq):
        """
        Append another sequence (and its quality) to this one
        @param seq: the sequence to append
        @type  seq: class L{Sequence<Sequence>}
        """
        otherQuality = seq.getQuality()
        # check before modifying anything, so that a failure leaves this
        # object untouched
        if self.quality is not None and otherQuality is None:
            raise TypeError("Cannot concatenate sequence '%s', which has no quality, to sequence '%s'" % (seq.getName(), self.getName()))
        self.setSequence(self.getSequence() + seq.getSequence())
        if self.quality is not None:
            sep = " " if self.integerQuality else ""
            self.setQuality(self.getQuality() + sep + otherQuality)
        self.chunkedSequence = None
        self.chunkedQuality = None


    def printFasta(self):
        """
        Export the sequence in FASTA format (60 nucleotides per line)
        @return: a string
        """
        if self.chunkedSequence is None:
            self.chunkSequence()
        return ">%s\n%s\n" % (self.getHeader(), "\n".join(self.chunkedSequence))


    def printFastq(self):
        """
        Export the sequence in FASTQ format (sequence and quality on one line each)
        @return: a string
        """
        # the chunks are not used in this format: no need to compute them
        return "@%s\n%s\n+%s\n%s\n" % (self.getHeader(), self.getSequence(), self.getHeader(), self.getQuality())


    def reverseComplement(self):
        """
        Replace the sequence by its reverse-complement, and reverse the quality
        Exit if the sequence contains an unknown character
        """
        self.chunkedSequence = None
        self.chunkedQuality = None
        sequence = self.getSequence()
        complement = []
        for char in sequence:
            if char not in reverseComplementString:
                sys.exit("Cannot understand character %s from string %s" % (char, sequence))
            complement.append(reverseComplementString[char])
        # reverse once at the end rather than prepending to a string
        complement.reverse()
        self.setSequence("".join(complement))
        if self.quality is not None:
            self.quality = self.quality[::-1]


    def containsAmbiguousNucleotides(self):
        """
        Check whether the sequence contains anything but A, C, G, T or U (either case)
        @return: a boolean
        """
        return re.search("[^ACGTUacgtu]", self.getSequence()) is not None


    def shrinkToFirstNucleotides(self, nbNucleotides):
        """
        Keep the first nucleotides only
        @param nbNucleotides: number of nucleotides to keep
        @type  nbNucleotides: int
        """
        self.chunkedSequence = None
        self.chunkedQuality = None
        self.setSequence(self.getSequence()[0:nbNucleotides])
        if self.quality is not None:
            self.quality = self.quality[0:nbNucleotides]


    def shrinkToLastNucleotides(self, nbNucleotides):
        """
        Keep the last nucleotides only
        @param nbNucleotides: number of nucleotides to keep
        @type  nbNucleotides: int
        """
        self.chunkedSequence = None
        self.chunkedQuality = None
        if nbNucleotides == 0:
            # "[-0:]" would select everything instead of nothing
            self.setSequence("")
            if self.quality is not None:
                self.quality = self.quality[0:0]
            return
        self.setSequence(self.getSequence()[-nbNucleotides:])
        if self.quality is not None:
            self.quality = self.quality[-nbNucleotides:]
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/SequenceList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/SequenceList.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,72 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import math

class SequenceList(object):
    """
    A class that codes for a list of sequences
    @ivar sequences: the sequences
    @type sequences: list
    @ivar verbosity: verbosity
    @type verbosity: int [default: 0]
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.sequences = []
        self.verbosity = verbosity


    def nbSequences(self):
        """
        Get the number of sequences
        @return: the size of the list
        """
        return len(self.sequences)


    def getSequence(self, index):
        """
        Get a sequence
        @param index: position of the sequence in the list
        @type  index: int
        @return: the sequence stored at this position
        """
        return self.sequences[index]


    def addSequence(self, sequence):
        """
        Add a sequence at the end of the list
        @param sequence: the sequence to add
        """
        self.sequences.append(sequence)


    def split(self, number):
        """
        Split the list into at most a given number of lists of consecutive
        sequences; every list but possibly the last one has the same size
        @param number: maximum number of lists
        @type  number: int
        @return: a list of L{SequenceList<SequenceList>}
        """
        sequenceLists = []
        # float() forces a true division: with two ints, Python 2 floors the
        # quotient before math.ceil sees it, which gives too small a size
        # (or 0, in which case no list is ever considered full)
        size = int(math.ceil(self.nbSequences() / float(number)))

        sequenceList = SequenceList()
        for sequence in self.sequences:
            sequenceList.addSequence(sequence)
            if sequenceList.nbSequences() == size:
                sequenceLists.append(sequenceList)
                sequenceList = SequenceList()
        # the last list may be shorter than the others
        if sequenceList.nbSequences() != 0:
            sequenceLists.append(sequenceList)
        return sequenceLists


    def printFasta(self):
        """
        Export all the sequences in FASTA format
        @return: a string
        """
        return "".join([sequence.printFasta() for sequence in self.sequences])
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/SubMapping.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/SubMapping.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,258 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+from SMART.Java.Python.structure.Interval import Interval\n+from commons.core.coord.Align import Align\n+\n+class SubMapping(Align):\n+ """\n+ A class that represents a part of a mapping, more precisely, a pair (target interval, query interval) that match together\n+ @ivar targetInterval: the target interval\n+ @type targetInterval: class L{Interval<Interval>}\n+ @ivar queryInterval: the query interval\n+ @type queryInterval: class L{Interval<Interval>}\n+ @ivar size: size of this sub-mapping\n+ @type size: int\n+ @ivar tags: various information\n+ @type tags: dict\n+ """\n+\n+ def __init__(self, subMapping = None):\n+ """\n+ Constructor\n+ @param subMapping: a sub-mapping to be copied\n+ @type subMapping: class L{SubMapping<SubMapping>}\n+ """\n+ self.targetInterval = Interval()\n+ self.queryInterval = Interval()\n+ Align.__init__(self, self.queryInterval, self.targetInterval)\n+ self.size = None\n+ self.tags = {}\n+ if subMapping != None:\n+ self.copy(subMapping)\n+ \n+ def __eq__(self, o):\n+ if o == None:\n+ return False\n+ areAlignAttributesEquals = Align.__eq__(self, o)\n+ return areAlignAttributesEquals and (self.targetInterval == o.targetInterval) and (self.queryInterval == o.queryInterval) and self.size == o.getSize() and self.tags == o.getTags()\n+ \n+ def getSuperAdress(self):\n+ return hex(id(super(Align, self)))\n+ \n+# def setRangesAlignToRangesInterval(self):\n+# self.range_query = super(Range, self.queryInterval)\n+# self.range_subject = super(Range, self.targetInterval)\n+ \n+ def copy(self, subMapping):\n+ """\n+ 
Copy method\n+ @param subMapping: a sub-mapping to be copied\n+ @type subMapping: class L{SubMapping<SubMapping>}\n+ """\n+ self.setQueryName(subMapping.getQueryName())\n+ self.setQueryStart(subMapping.getQueryStart())\n+ self.setQueryEnd(subMapping.getQueryEnd())\n+ self.setSubjectName(subMapping.getSubjectName())\n+ self.setSubjectStart(subMapping.getSubjectStart())\n+ self.setSubjectEnd(subMapping.getSubjectEnd())\n+ self.e_value = subMapping.getEvalue()\n+ self.score = subMapping.getScore()\n+ self.identity = subMapping.getIdentity()\n+ \n+ self.targetInterval.copy(subMapping.targetInterval)\n+ sel'..b' @type name: string\n+ @param value: value of the tag\n+ @type value: string or int\n+ """\n+ self.tags[name] = value\n+\n+\n+ def getTagValue(self, name):\n+ """\n+ Get the value of a tag\n+ @param name: name of the tag\n+ @type name: string\n+ @return: value of the tag\n+ """\n+ return self.tags[name]\n+\n+ \n+ def getTagNames(self):\n+ """\n+ Get all the names of the tags\n+ @return: the names of the tags\n+ """\n+ return self.tags.keys()\n+\n+ def getTargetInterval(self):\n+ return self.targetInterval\n+ \n+ def getQueryInterval(self):\n+ return self.queryInterval\n+ \n+ def getSize(self):\n+ return self.size\n+ \n+ def getTags(self):\n+ return self.tags\n+\n+ def setIdentity(self, identity):\n+ """\n+ Set the percentage of identity of the sub-mapping\n+ Possibly also set number of mismatches\n+ @param identity: the percentage of identity of the sub-mapping\n+ @type identity: float\n+ """\n+ self.identity = identity\n+ self.setTagValue("identity", identity)\n+ if self.size != None and "nbMismatches" not in self.getTagNames():\n+ self.setTagValue("nbMismatches", self.size - round(self.size * self.getTagValue("identity") / 100.0))\n+\n+\n+ def setNbMismatches(self, nbMismatches):\n+ """\n+ Set the number of mismatches of the sub-mapping\n+ Possibly also set percentage of identity\n+ @param nbMismatches: the number of mismatches of the sub-mapping\n+ @type 
nbMismatches: int\n+ """\n+ self.nbMismatches = nbMismatches\n+ if self.size != None and "identity" not in self.getTagNames():\n+ self.setTagValue("identity", (self.size - self.getTagValue("nbMismatches")) / float(self.size) * 100)\n+\n+\n+ def setNbGaps(self, nbGaps):\n+ """\n+ Set the number of gaps of the sub-mapping\n+ @param nbGaps: the number of gaps of the sub-mapping\n+ @type nbGaps: int\n+ """\n+ self.setTagValue("nbGaps", nbGaps)\n+ \n+ \n+ def merge(self, subMapping):\n+ """\n+ Merge two subMappings\n+ @param subMapping: another sub-mapping\n+ @type subMapping: class L{SubMapping<SubMapping>}\n+ """\n+ self.targetInterval.merge(subMapping.targetInterval)\n+ self.queryInterval.merge(subMapping.queryInterval)\n+\n+\n+ def printCoordinates(self):\n+ """\n+ Print the coordinates of the sub-mapping (considering the direction)\n+ @return: a string\n+ """\n+ if self.getDirection() == 1:\n+ return "%d-%d" % (self.targetInterval.getStart(), self.targetInterval.getEnd())\n+ else:\n+ return "%d-%d" % (self.targetInterval.getEnd(), self.targetInterval.getStart())\n+\n+\n+ def __str__(self):\n+ """\n+ Return a representation of this object\n+ @return: a string\n+ """\n+\n+ if "match" in self.getTagNames() and not self.getTagValue("match"):\n+ return "%s ---" % self.queryName\n+\n+ direction = "+"\n+ if self.getDirection() == -1:\n+ direction = "-"\n+ string = "%s:%d-%d -- %s:%d-%d (%s)" % (self.targetInterval.getChromosome(), self.targetInterval.getStart(), self.targetInterval.getEnd(), self.queryInterval.name, self.queryInterval.getStart(), self.queryInterval.getEnd(), direction)\n+ if "nbMismatches" in self.getTagNames():\n+ string += "(%i mm)" % (self.getTagValue("nbMismatches"))\n+ if "identity" in self.getTagNames():\n+ string += "(id: %i%%)" % (self.getTagValue("identity"))\n+ if self.targetInterval.getSize() != None and self.queryInterval.getSize() != None and self.size != None:\n+ string += "(sizes: %d, %d -> %d)" % (self.targetInterval.getSize(), 
self.queryInterval.getSize(), self.size)\n+ return string\n+\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/Transcript.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/Transcript.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,876 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import sys\n+from SMART.Java.Python.structure.Interval import Interval\n+from SMART.Java.Python.structure.Sequence import Sequence\n+\n+\n+class Transcript(Interval):\n+\t"""\n+\tA class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals)\n+\t@ivar exons: a list of exons (intervals)\n+\t@type exons: list of L{Interval{Interval}}\n+\t"""\n+\n+\tdef __init__(self, transcript = None, verbosity = 0):\n+\t\t"""\n+\t\tConstructor\n+\t\t@param transcript: transcript to be copied\n+\t\t@type transcript: class L{Transcript<Transcript>}\n+\t\t@param verbosity: verbosity\n+\t\t@type verbosity: int\n+\t\t"""\n+\t\tsuper(Transcript, self).__init__(None, verbosity)\n+\t\tself.exons = []\n+\t\tself.introns = None\n+\t\tif transcript != None:\n+\t\t\tself.copy(transcript)\n+\n+\n+\tdef copy(self, transcript):\n+\t\t"""\n+\t\tCopy method\n+\t\t@param transcript: transcript to be copied\n+\t\t@type\ttranscript: class L{Transcript<Transcript>} or L{Interval<Interval>}\n+\t\t"""\n+\t\tsuper(Transcript, self).copy(transcript)\n+\t\tif transcript.__class__.__name__ == "Transcript":\n+\t\t\texons = transcript.getExons()\n+\t\t\tif len(exons) > 1:\n+\t\t\t\tfor exon in exons:\n+\t\t\t\t\texonCopy = Interval(exon)\n+\t\t\t\t\tself.addExon(exonCopy)\n+\n+\n+\tdef setDirection(self, direction):\n+\t\t"""\n+\t\tSet the direction of the interval\n+\t\tPossibly parse different formats\n+\t\tImpact all exons\n+\t\t@param direction: direction of the transcript (+ / 
-)\n+\t\t@type\tdirection: int or string\n+\t\t"""\n+\t\tsuper(Transcript, self).setDirection(direction)\n+\t\tfor exon in self.exons:\n+\t\t\texon.setDirection(direction)\n+\t\t\t\n+\n+\tdef setChromosome(self, chromosome):\n+\t\t"""\n+\t\tSet the chromosome\n+\t\t@param chromosome: chromosome on which the transcript is\n+\t\t@type chromosome: string\n+\t\t"""\n+\t\tsuper(Transcript, self).setChromosome(chromosome)\n+\t\tfor exon in self.exons:\n+\t\t\texon.setChromosome(chromosome)\n+\n+\t\n+\tdef addExon(self, exon):\n+\t\t"""\n+\t\tAdd an exon to the list of exons\n+\t\t@param exon: a new exon\n+\t\t@type exon: class L{Interval<Interval>}\n+\t\t"""\n+\t\tif not self.exons and not exon.overlapWith(self):\n+\t\t\tfirstExon = Interval()\n+\t\t\tfirstExon.setStart(self.getStart())\n+\t\t\tfirstExon.setEnd(self.getEnd())\n+\t\t\tfirstExon.setDirection(self.getDirection())\n+\t\t\tfirstExon.setChromosome(self.getChromosome())\n+\t\t\tself.exons.append(firstExon)\n+\t\tnewExon = Interval(exon)\n+\t\tnewExon.setDirection(self.getDirection())\n+\t\tself.exons.append(newExon)\n+\t\tif newExon.getStart() < self.getStart():\n+\t\t\tself.setSta'..b'\tif i == 0:\n+\t\t\t\tcontinue\n+\t\t\tcigar += "%dN" % (exon.getStart() - lastExonEnd - 1)\n+\t\t\tcigar += "%dM" % (exon.getSize())\n+\n+\t\treturn "%s\\t%d\\t%s\\t%d\\t%d\\t%s\\t%s\\t%d\\t%d\\t%s\\t%s\\t%s\\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags)\n+\n+\n+\tdef printUcsc(self):\n+\t\t"""\n+\t\tExport this transcript using UCSC BED format\n+\t\t@return: a string\n+\t\t"""\n+\t\tif self.getChromosome().find("Het") != -1:\n+\t\t\treturn ""\n+\t\tname\t = self.name\n+\t\tcomment = self.getTagValues(";", "")\n+\t\tsizes\t = []\n+\t\tstarts\t= []\n+\t\tdirection = "+"\n+\t\tif self.getDirection() == -1:\n+\t\t\tdirection = "-"\n+\t\tself.sortExonsIncreasing()\n+\t\tfor exon in self.getExons():\n+\t\t\tsizes.append("%d" % 
(exon.getSize()))\n+\t\t\tstarts.append("%d" % (exon.getStart() - self.getStart()))\n+\t\treturn "%s\\t%d\\t%d\\t%s\\t1000\\t%s\\t%d\\t%d\\t0\\t%d\\t%s,\\t%s,\\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts))\n+\n+\n+\tdef printGBrowseReference(self):\n+\t\t"""\n+\t\tExport this transcript using GBrowse format (1st line only)\n+\t\t@return: a string\n+\t\t"""\n+\t\treturn "reference = %s\\n" % (self.getChromosome())\n+\n+\n+\tdef printGBrowseLine(self):\n+\t\t"""\n+\t\tExport this transcript using GBrowse format (2nd line only)\n+\t\t@return: a string\n+\t\t"""\n+\t\tself.sortExons()\n+\t\tcoordinates = []\n+\t\tfor exon in self.getExons():\n+\t\t\tcoordinates.append(exon.printCoordinates())\n+\t\tcoordinatesString = ",".join(coordinates)\n+\t\tcomment = self.getTagValues(";", "=")\n+\t\tif comment:\n+\t\t\tcomment = "\\t\\"%s\\"" % (comment)\n+\t\treturn "User_data\\t%s\\t%s%s\\n" % (self.name, coordinatesString, comment)\n+\n+\t\n+\tdef printGBrowse(self):\n+\t\t"""\n+\t\tExport this transcript using GBrowse format\n+\t\t@return: a string\n+\t\t"""\n+\t\treturn "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine())\n+\n+\n+\tdef printCsv(self):\n+\t\t"""\n+\t\tExport this transcript using CSV format\n+\t\t@return: a string\n+\t\t"""\n+\t\tself.sortExons()\n+\t\tstring = "%s,%d,%d,\\"%s\\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-")\n+\t\tif len(self.getExons()) == 1:\n+\t\t\tstring += "None"\n+\t\telse:\n+\t\t\tfor exon in self.getExons():\n+\t\t\t\tstring += "%d-%d " % (exon.getStart(), exon.getEnd())\n+\t\tfor tag in sorted(self.tags.keys()):\n+\t\t\tstring += ",%s=%s" % (tag, str(self.tags[tag]))\n+\t\tstring += "\\n"\n+\t\treturn string\n+\n+\n+\tdef extractSequence(self, parser):\n+\t\t"""\n+\t\tGet the sequence corresponding to this 
transcript\n+\t\t@param parser: a parser to a FASTA file\n+\t\t@type parser: class L{SequenceListParser<SequenceListParser>}\n+\t\t@return:\t an instance of L{Sequence<Sequence>}\n+\t\t"""\n+\t\tself.sortExons()\n+\t\tname = self.name\n+\t\tif "ID" in self.getTagNames() and self.getTagValue("ID") != self.name:\n+\t\t\tname += ":%s" % (self.getTagValue("ID"))\n+\t\tsequence = Sequence(name)\n+\t\tfor exon in self.getExons():\n+\t\t\tsequence.concatenate(exon.extractSequence(parser))\n+\t\treturn sequence\n+\t\n+\t\n+\tdef extractWigData(self, parser):\n+\t\t"""\n+\t\tGet some wig data corresponding to this transcript\n+\t\t@param parser: a parser to a wig file\n+\t\t@type parser: class L{WigParser<WigParser>}\n+\t\t@return: a sequence of float\n+\t\t"""\n+\t\tself.sortExons()\n+\t\tif parser.strands:\n+\t\t\tstrands = (-1, 1)\n+\t\t\tvalues = dict([(strand, []) for strand in strands])\n+\t\t\tfor exon in self.getExons():\n+\t\t\t\ttheseValues = exon.extractWigData(parser)\n+\t\t\t\tif self.getDirection() == -1:\n+\t\t\t\t\tfor strand in strands:\n+\t\t\t\t\t\ttheseValues[strand].reverse()\n+\t\t\t\tfor strand in strands:\n+\t\t\t\t\tvalues[strand].extend(theseValues[strand])\n+\t\t\tif self.getDirection() == -1:\n+\t\t\t\tfor strand in strands:\n+\t\t\t\t\tvalues[strand].reverse()\n+\t\t\treturn values\n+\t\telse:\n+\t\t\tvalues = []\n+\t\t\tfor exon in self.getExons():\n+\t\t\t\ttheseValues = exon.extractWigData(parser)\n+\t\t\t\t#if self.getDirection() == -1:\n+\t\t\t\t#\ttheseValues.reverse()\n+\t\t\t\tvalues.extend(theseValues)\n+\t\t\t#if self.getDirection() == -1:\n+\t\t\t#\tvalues.reverse()\n+\t\t\treturn values\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/TranscriptContainer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/TranscriptContainer.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,236 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import re\n+import sys\n+from commons.core.parsing.ParserChooser import ParserChooser\n+from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable\n+from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter\n+\n+class TranscriptContainer(object):\n+ """\n+ An interface class that contains a list of transcripts, handle different formats\n+ @ivar container: container of the data\n+ @type container: string \n+ @ivar format: format of the data\n+ @type format: string \n+ @ivar transcriptListParser: possibly contains a parser to a list of transcripts\n+ @type transcriptListParser: L{TranscriptListParser<TranscriptListParser>} or None\n+ @ivar mappingListParser: possibly contains a parser to a list of mappings\n+ @type mappingListParser: L{MapperParser<MapperParser>} or None\n+ @ivar transcriptTables: possibly contains the mySQL tables\n+ @type transcriptTables: dict of L{MySqlTranscriptTable<MySqlTranscriptTable>} or None\n+ @ivar mySqlConnection: connection to a MySQL database\n+ @type mySqlConnection: class L{MySqlConnection<MySqlConnection>}\n+ @ivar type: type of the data (transcripts, mappings or mySQL)\n+ @type type: string\n+ @ivar verbosity: verbosity\n+ @type verbosity: int \n+ """\n+\n+ def __init__(self, container, format, verbosity = 0):\n+ """\n+ Constructor\n+ @param container: container of the data\n+ @type container: string\n+ @param format: format of the data\n+ @type format: string\n+ @param verbosity: verbosity\n+ @type verbosity: int\n+ """\n+ self.container = container\n+ 
self.format = format\n+ self.verbosity = verbosity\n+ self.transcriptListParser = None\n+ self.mappingListParser = None\n+ self.transcriptTables = {}\n+ self.mySqlConnection = None\n+ self.foundData = False\n+ self.nbTranscripts = None\n+ self.nbNucleotides = None\n+ self.chromosomes = None\n+ self.type = None\n+ if self.container == None:\n+ sys.exit("Error! Container input file name is empty!")\n+ if self.format == None:\n+ sys.exit("Error! Container input format is empty!")\n+ \n+ \n+ def findData(self):\n+ """\n+ Load data\n+ """\n+ if self.format == None:\n+ sys.ex'..b'ndle format \'%s\'!" % (self.format))\n+\n+ if self.transcriptListParser != None:\n+ if self.type == "transcript":\n+ self.nbTranscripts = self.transcriptListParser.getNbTranscripts()\n+ self.nbNucleotides = self.transcriptListParser.getNbNucleotides()\n+ self.chromosomes = self.transcriptListParser.getChromosomes()\n+ if self.mappingListParser != None:\n+ if self.type == "mapping":\n+ self.nbTranscripts = self.mappingListParser.getNbMappings()\n+ self.nbNucleotides = self.mappingListParser.getNbNucleotides()\n+ self.chromosomes = self.mappingListParser.getChromosomes()\n+\n+ self.foundData = True\n+\n+\n+ def getNbTranscripts(self):\n+ """\n+ Get the number of transcripts\n+ @return: the number of transcripts\n+ """\n+ if not self.foundData:\n+ self.findData()\n+ return self.nbTranscripts\n+ \n+ \n+ def getNbItems(self):\n+ """\n+ Same as getNbTranscripts\n+ """\n+ return self.getNbTranscripts()\n+\n+\n+ def getNbNucleotides(self):\n+ """\n+ Get the number of nucleotides\n+ @return: the number of nucleotides\n+ """\n+ if not self.foundData:\n+ self.findData()\n+ return self.nbNucleotides\n+\n+\n+ def getChromosomes(self):\n+ """\n+ Get the chromosomes\n+ @return: the chromosomes\n+ """\n+ if not self.foundData:\n+ self.findData()\n+ return self.chromosomes\n+ \n+\n+ def getIterator(self):\n+ """\n+ An iterator\n+ @return: an iterator to a list of transcripts\n+ """\n+ if not self.foundData:\n+ 
self.findData()\n+ if self.type == "sql":\n+ for chromosome in self.transcriptTables:\n+ for transcript in self.transcriptTables[chromosome].getIterator():\n+ yield transcript\n+ return\n+ if self.type == "transcript":\n+ for transcript in self.transcriptListParser.getIterator():\n+ yield transcript\n+ return\n+ if self.type == "mapping":\n+ for mapping in self.mappingListParser.getIterator():\n+ yield mapping.getTranscript()\n+ return\n+ sys.exit("Error! No valid transcript container given!")\n+ \n+ \n+ def storeIntoDatabase(self, name = None):\n+ """\n+ Store the current transcript / mapping list into database\n+ """\n+ if not self.foundData:\n+ self.findData()\n+\n+ if (self.transcriptListParser == None and self.mappingListParser == None) or len(self.transcriptTables.keys()) != 0:\n+ return\n+ \n+ mySqlTranscriptWriter = MySqlTranscriptWriter(self.mySqlConnection, name, self.verbosity)\n+ mySqlTranscriptWriter.addTranscriptList(self.transcriptListParser if self.transcriptListParser else self.mappingListParser)\n+ mySqlTranscriptWriter.write()\n+ self.transcriptTables = mySqlTranscriptWriter.getTables()\n+ self.type = "sql"\n+ \n+ \n+ def getTables(self):\n+ """\n+ Accessor to the mySQL tables\n+ @return: the mySQL tables\n+ """\n+ return self.transcriptTables\n+ \n+\n+ def setDefaultTagValue(self, name, value):\n+ """\n+ Set the given tag to the value for all transcripts\n+ @param name: name of the tag\n+ @type name: string\n+ @param value: value of the tag\n+ @type value: string\n+ """\n+ if self.type == "sql":\n+ for chromosome in self.transcriptTables:\n+ self.transcriptTables[chromosome].setDefaultTagValue(name, value)\n+ elif self.type == "transcript":\n+ self.transcriptListParser.setDefaultTagValue(name, value)\n+ elif self.type == "mapping":\n+ self.mappingListParser.setDefaultTagValue(name, value)\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/TranscriptList.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/TranscriptList.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,172 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.misc.Progress import Progress


class TranscriptList(object):
    """
    A class that codes for a list of transcripts, stored per chromosome
    @ivar transcripts:       the transcripts, indexed by chromosome name
    @type transcripts:       dict of string to list of L{Transcript<Transcript>}
    @ivar longestTranscript: largest (end - start) value seen by addTranscript
    @type longestTranscript: int
    @ivar verbosity:         verbosity
    @type verbosity:         int
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.transcripts       = dict()
        self.longestTranscript = 0
        self.verbosity         = verbosity


    def getTranscript(self, chromosome, index):
        """
        Get a transcript given its chromosome and its rank on this chromosome
        @param chromosome: name of the chromosome
        @type  chromosome: string
        @param index:      rank of the transcript in the list of this chromosome
        @type  index:      int
        @return:           the stored transcript
        """
        return self.transcripts[chromosome][index]


    def getChromosomes(self):
        """
        Get the names of the chromosomes which hold at least one list
        @return: a list of chromosome names
        """
        # list() keeps the Python 2 contract (a real list) on any interpreter
        return list(self.transcripts.keys())


    def getTranscriptsOnChromosome(self, chromosome):
        """
        Get the transcripts stored for a chromosome
        @param chromosome: name of the chromosome
        @type  chromosome: string
        @return:           the list of transcripts (empty if the chromosome is unknown)
        """
        if chromosome not in self.transcripts:
            return []
        return self.transcripts[chromosome]


    def addTranscript(self, transcript):
        """
        Add a transcript, and update the longest transcript seen so far
        @param transcript: the transcript to add
        @type  transcript: class L{Transcript<Transcript>}
        """
        self.transcripts.setdefault(transcript.getChromosome(), []).append(transcript)
        self.longestTranscript = max(self.longestTranscript, transcript.getEnd() - transcript.getStart())


    def removeTranscript(self, chromosome, i):
        """
        Remove a transcript given its chromosome and its rank
        @param chromosome: name of the chromosome
        @type  chromosome: string
        @param i:          rank of the transcript in the list of this chromosome
        @type  i:          int
        """
        del self.transcripts[chromosome][i]


    def removeAll(self):
        """
        Remove all the transcripts
        """
        self.transcripts = {}


    def getNbTranscripts(self):
        """
        Get the number of stored entries
        @return: the sum of the lengths of the per-chromosome lists
        """
        return sum(len(transcripts) for transcripts in self.transcripts.values())


    def getSize(self):
        """
        Get the cumulated size of the transcripts
        @return: the sum of getSize() over all the stored transcripts
        """
        size = 0
        for chromosome in self.transcripts:
            for transcript in self.transcripts[chromosome]:
                size += transcript.getSize()
        return size


    def sort(self):
        """
        Sort the transcripts of each chromosome by increasing start position
        """
        # A key function gives the same (stable) order as the former
        # "x.getStart() - y.getStart()" comparison, and also works where
        # list.sort() no longer accepts a comparison function.
        for chromosome in self.transcripts:
            self.transcripts[chromosome].sort(key = lambda transcript: transcript.getStart())


    def _removeOverlapping(self, transcriptList, overlaps):
        """
        Remove the transcripts for which overlaps(this, that) is true for some transcript of the other list
        @param transcriptList: the other list (sorted in place by this method)
        @type  transcriptList: class L{TranscriptList<TranscriptList>}
        @param overlaps:       predicate taking (transcript of this list, transcript of the other list)
        @type  overlaps:       function
        """
        transcriptList.sort()
        for chromosome in self.transcripts:
            theseTranscripts = self.transcripts[chromosome]
            # a chromosome missing from the other list simply removes nothing
            thoseTranscripts = transcriptList.transcripts.get(chromosome, [])
            progress         = Progress(len(theseTranscripts), "Handling chromosome %s" % (chromosome), self.verbosity)
            keptTranscripts  = []
            for thisTranscript in theseTranscripts:
                progress.inc()
                if thisTranscript is None:
                    continue
                removed = False
                for thatTranscript in thoseTranscripts:
                    if overlaps(thisTranscript, thatTranscript):
                        removed = True
                        break
                    # the other list is sorted by start: nothing further can reach this transcript
                    if thatTranscript.getStart() > thisTranscript.getEnd():
                        break
                if not removed:
                    keptTranscripts.append(thisTranscript)
            self.transcripts[chromosome] = keptTranscripts
            progress.done()


    def removeOverlapWith(self, transcriptList):
        """
        Remove the transcripts which overlap (see overlapWith) with a transcript of the other list
        @param transcriptList: the other list (sorted in place by this method)
        @type  transcriptList: class L{TranscriptList<TranscriptList>}
        """
        self._removeOverlapping(transcriptList, lambda thisTranscript, thatTranscript: thisTranscript.overlapWith(thatTranscript))


    def removeOverlapWithExon(self, transcriptList):
        """
        Remove the transcripts which overlap (see overlapWithExon) with a transcript of the other list
        @param transcriptList: the other list (sorted in place by this method)
        @type  transcriptList: class L{TranscriptList<TranscriptList>}
        """
        self._removeOverlapping(transcriptList, lambda thisTranscript, thatTranscript: thisTranscript.overlapWithExon(thatTranscript))


    def setDefaultTagValue(self, name, value):
        """
        Set the given tag to the value for all transcripts
        @param name:  name of the tag
        @type  name:  string
        @param value: value of the tag
        @type  value: string
        """
        # NOTE(review): other classes of this package expose "setTagValue";
        # confirm that Transcript really provides "setTag".
        for transcript in self.getIterator():
            transcript.setTag(name, value)


    def storeDatabase(self, mySqlConnection):
        """
        Store the transcripts and their exons into the tables "TmpTranscriptsTable" and "TmpIntervalsTable"
        Each exon line carries, as "idTranscript", the value returned when adding its transcript
        @param mySqlConnection: connection handed to the tables
        @type  mySqlConnection: class L{MySqlConnection<MySqlConnection>}
        """
        transcriptsTable = MySqlTable("TmpTranscriptsTable", mySqlConnection)
        transcriptsTable.create(Transcript.getSqlVariables(), Transcript.getSqlTypes())
        intervalsVariables = Interval.getSqlVariables()
        intervalsVariables.append("idTranscript")
        intervalsTypes = Interval.getSqlTypes()
        intervalsTypes["idTranscript"] = "int"
        intervalsTable = MySqlTable("TmpIntervalsTable", mySqlConnection)
        intervalsTable.create(intervalsVariables, intervalsTypes)
        for chromosome in self.transcripts:
            for transcript in self.transcripts[chromosome]:
                idTranscript = transcriptsTable.addLine(transcript.getSqlValues())
                for exon in transcript.getExons():
                    intervalValues = exon.getSqlValues()
                    intervalValues["idTranscript"] = idTranscript
                    intervalsTable.addLine(intervalValues)


    def getIterator(self):
        """
        Iterate over the transcripts, chromosome after chromosome, skipping the entries set to None
        @return: a generator of transcripts
        """
        # snapshot as a real list: the names are accessed by position below
        chromosomes       = list(self.transcripts.keys())
        currentChromosome = 0
        currentTranscript = 0
        while True:
            if currentChromosome >= len(chromosomes):
                return
            elif currentTranscript >= len(self.transcripts[chromosomes[currentChromosome]]):
                currentTranscript  = 0
                currentChromosome += 1
            elif self.transcripts[chromosomes[currentChromosome]][currentTranscript] is None:
                currentTranscript += 1
            else:
                yield self.transcripts[chromosomes[currentChromosome]][currentTranscript]
                currentTranscript += 1


    def __str__(self):
        """
        Return a representation of this object
        @return: the concatenation of the representations of the transcripts
        """
        return "".join(str(transcript) for transcript in self.getIterator())

b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/TranscriptListIterator.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/TranscriptListIterator.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,58 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

class TranscriptListIterator(object):
    """
    A class that iterates on a list of transcripts, chromosome after chromosome
    Entries set to None in the underlying lists are skipped
    @ivar transcriptList:    the list which is iterated on
    @type transcriptList:    object with a "transcripts" dict (chromosome name to list)
    @ivar verbosity:         verbosity
    @type verbosity:         int
    @ivar chromosomes:       chromosome names, as found when the iterator was created
    @type chromosomes:       list of string
    @ivar currentChromosome: rank of the current chromosome in chromosomes
    @type currentChromosome: int
    @ivar currentTranscript: rank of the last returned transcript on the current chromosome
    @type currentTranscript: int
    """

    def __init__(self, transcriptList, verbosity = 0):
        """
        Constructor
        @param transcriptList: the list to iterate on
        @type  transcriptList: object with a "transcripts" dict (chromosome name to list)
        @param verbosity:      verbosity
        @type  verbosity:      int
        """
        self.transcriptList    = transcriptList
        self.verbosity         = verbosity
        # snapshot as a real list: the names are accessed by position in next()
        self.chromosomes       = list(self.transcriptList.transcripts.keys())
        self.currentChromosome = 0
        self.currentTranscript = -1


    def __iter__(self):
        """
        Iterator protocol
        @return: this object
        """
        return self


    def next(self):
        """
        Get the next transcript which is not None
        @return: the next transcript
        @raise StopIteration: when every chromosome of the snapshot has been visited
        """
        self.currentTranscript += 1
        while True:
            # compare with the snapshot, not with the live dict: a chromosome added
            # after the creation of the iterator has no entry in self.chromosomes
            if self.currentChromosome >= len(self.chromosomes):
                raise StopIteration
            transcripts = self.transcriptList.transcripts[self.chromosomes[self.currentChromosome]]
            if self.currentTranscript >= len(transcripts):
                self.currentTranscript  = 0
                self.currentChromosome += 1
            elif transcripts[self.currentTranscript] is None:
                self.currentTranscript += 1
            else:
                return transcripts[self.currentTranscript]


    # same method under the name used by the newer iterator protocol
    __next__ = next

b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/structure/TranscriptListsComparator.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/TranscriptListsComparator.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,1198 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import sys\n+import random\n+from SMART.Java.Python.misc import Utils\n+from SMART.Java.Python.structure.Transcript import Transcript\n+from SMART.Java.Python.structure.TranscriptList import TranscriptList\n+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer\n+from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection\n+from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable\n+from SMART.Java.Python.misc.Progress import Progress\n+from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter\n+\n+\n+\n+class TranscriptListsComparator(object):\n+ """\n+ Compare two transcript lists, using a database for one of the list\n+ Uses one TranscriptContainer for query data, \n+ one TranscriptContainer exported to MySqlTranscriptTable for reference data, \n+ one MySqlTranscriptTable for transformed reference data\n+ @ivar inputTranscriptContainers: parsers to the list of query transcripts\n+ @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>}\n+ @ivar writer: transcript list writer\n+ @type writer: class L{TranscriptListWriter<TranscriptListWriter>}\n+ @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently)\n+ @type mySqlConnection: class L{MySqlConnection<MySqlConnection>}\n+ @ivar introns: compare transcripts or exons only\n+ @type introns: list of 2 boolean\n+ @ivar starts: restrict the query transcripts to first nucleotides\n+ @type starts: list of 2 int or None\n+ @ivar fivePrimes: extend a list 
of transcripts by their 5\' end\n+ @type fivePrimes: list of 2 int or None\n+ @ivar threePrimes: extend a list of transcripts by their 3\' end\n+ @type threePrimes: list of 2 int or None\n+ @ivar minDistance: min distance between two transcripts [default: 0]\n+ @type minDistance: int\n+ @ivar maxDistance: max distance between two transcripts [default: 0]\n+ @type maxDistance: int\n+ @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap\n+ @type minOverlap: int\n+ @ivar pcOverlap: percentage of overlapping nucleotides to declare an ove'..b' for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):\n+ transcripts2.append(transcript2)\n+ command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)\n+ self.mySqlConnection.executeQuery(command)\n+\n+ # compare sets\n+ toBeRemoved1 = []\n+ for index1, transcript1 in enumerate(transcripts1):\n+ newTranscript1 = Transcript()\n+ newTranscript1.copy(transcript1)\n+ for transcript2 in transcripts2:\n+ newTranscript1 = newTranscript1.getDifference(transcript2)\n+ if newTranscript1 == None:\n+ toBeRemoved1.append(index1)\n+ break\n+ transcripts1[index1] = newTranscript1\n+\n+ # check if query transcript extends bounds of the chunk\n+ if newTranscript1 != None and newTranscript1.getEnd() < end:\n+ if self.splitDifference:\n+ for exon in newTranscript1.getExons():\n+ transcript = Transcript()\n+ transcript.copy(exon)\n+ self.writeTranscript(transcript)\n+ else:\n+ self.writeTranscript(newTranscript1)\n+ toBeRemoved1.append(index1)\n+\n+ # update list of query transcripts\n+ for index1 in reversed(toBeRemoved1):\n+ del transcripts1[index1]\n+\n+ # check if the reference transcripts extends bounds of the chunk\n+ toBeRemoved2 = []\n+ for index2, transcript2 in enumerate(transcripts2):\n+ if transcript2.getEnd() + distance < end:\n+ toBeRemoved2.append(index2)\n+ for index2 in reversed(toBeRemoved2):\n+ del 
transcripts2[index2]\n+\n+ progress.inc()\n+\n+ for transcript1 in transcripts1:\n+ if self.splitDifference:\n+ for exon in transcript1.getExons():\n+ transcript = Transcript()\n+ transcript.copy(exon)\n+ self.writeTranscript(transcript)\n+ else:\n+ self.writeTranscript(transcript1)\n+ progress.done()\n+ self.getTables(self.QUERY)[chromosome1].remove()\n+ if chromosome1 in self.getTables(self.REFERENCE):\n+ self.getTables(self.REFERENCE)[chromosome1].remove()\n+ self.getTables(self.WORKING)[chromosome1].remove()\n+\n+ self.flushData()\n+ if self.writer != None:\n+ self.writer.close()\n+ self.writer = None\n+\n+ if self.verbosity > 0:\n+ print "query: %d elements" % (self.nbTranscripts[self.QUERY])\n+ print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])\n+ print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)\n+\n+\n+ def getOddsPerTranscript(self):\n+ """\n+ Return overlap results\n+ @return a dict of data\n+ """\n+ if not self.odds:\n+ raise Exception("Did not compute odds!")\n+ return self.overlapResults\n+\n+\n+ def getOdds(self):\n+ """\n+ Return odds about the overlap\n+ @return a dict of data\n+ """\n+ if not self.odds:\n+ raise Exception("Did not compute odds!")\n+ if self.oddResults != None:\n+ return self.oddResults\n+ self.oddResults = {}\n+ for name, value in self.overlapResults.iteritems():\n+ self.oddResults[value] = self.oddResults.get(value, 0) + 1\n+ return self.oddResults\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/Java/Python/trimSequences.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/trimSequences.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,149 @@\n+#! /usr/bin/env python\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+from optparse import OptionParser\n+from commons.core.parsing.FastaParser import FastaParser\n+from commons.core.parsing.FastqParser import FastqParser\n+from commons.core.writer.FastaWriter import FastaWriter\n+from commons.core.writer.FastqWriter import FastqWriter\n+from SMART.Java.Python.misc.Progress import Progress\n+from SMART.Java.Python.misc import Utils\n+\n+\n+if __name__ == "__main__":\n+ \n+ # parse command line\n+ description = "Trim Sequences v1.0.3: Remove the 5\' and/or 3\' adaptors of a list of reads. [Category: Data Modification]"\n+\n+ parser = OptionParser(description = description)\n+ parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in sequence format given by -f]")\n+ parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [compulsory] [format: sequence file format]")\n+ parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in sequence format given by -f]")\n+ parser.add_option("-3", "--threePAdaptor", dest="threePAdaptor", action="store", default=None, type="string", help="3\' adaptor [format: string] [default: None]")\n+ parser.add_option("-5", "--fivePAdaptor", dest="fivePAdaptor", action="store", default=None, type="string", help="5\' adaptor [format: string] [default: None]")\n+ parser.add_option("-e", "--errors", dest="errors", action="store", default=0, type="int", help="number of errors in 
percent [format: int] [default: 0]")\n+ parser.add_option("-d", "--indels", dest="indels", action="store_true", default=False, help="also accept indels [format: bool] [default: False]")\n+ parser.add_option("-n", "--noAdaptor5p", dest="noAdaptor5p", action="store", default=None, type="string", help="print sequences with no 5\' adaptor [format: output file in sequence format given by -f]")\n+ parser.add_option("-m", "--noAdaptor3p", dest="noAdaptor3p", action="store", default=None, type="string", help="print sequences with no 3\' adaptor [format: output file in sequence format given by -f]")\n+ parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default'..b'ormat))\n+\n+\n+ if options.noAdaptor5p != None:\n+ if options.format == "fasta":\n+ writer5pNoAdaptor = FastaWriter(options.noAdaptor5p, options.verbosity)\n+ elif options.format == "fastq":\n+ writer5pNoAdaptor = FastqWriter(options.noAdaptor5p, options.verbosity)\n+ else:\n+ raise Exception("Cannot handle files with \'%s\' format." % (options.format))\n+ nbFound5p = 0\n+ \n+ if options.noAdaptor3p != None:\n+ if options.format == "fasta":\n+ writer3pNoAdaptor = FastaWriter(options.noAdaptor3p, options.verbosity)\n+ elif options.format == "fastq":\n+ writer3pNoAdaptor = FastqWriter(options.noAdaptor3p, options.verbosity)\n+ else:\n+ raise Exception("Cannot handle files with \'%s\' format." 
% (options.format))\n+ nbFound3p = 0\n+ \n+ progress = Progress(parser.getNbSequences(), "Reading %s" % (options.inputFileName), options.verbosity)\n+ for sequence in parser.getIterator():\n+ progress.inc()\n+ if options.threePAdaptor != None:\n+ nucleotides = sequence.sequence\n+ found = False\n+ bestScore = 10000\n+ bestRegion = 0\n+ for i in range(len(nucleotides) - minSize):\n+ nucleotidesPart = nucleotides[i:]\n+ adaptorPart = options.threePAdaptor if len(nucleotidesPart) >= len(options.threePAdaptor) else options.threePAdaptor[:len(nucleotidesPart)]\n+ nucleotidesPart = nucleotidesPart if len(adaptorPart) == len(nucleotidesPart) else nucleotidesPart[:len(adaptorPart)]\n+ if options.indels:\n+ score = Utils.getLevenshteinDistance(adaptorPart, nucleotidesPart)\n+ else:\n+ score = Utils.getHammingDistance(adaptorPart, nucleotidesPart)\n+ if score <= int(options.errors / 100.0 * len(adaptorPart)) and score < bestScore:\n+ bestScore = score\n+ bestRegion = i\n+ found = True\n+ if found:\n+ nbFound3p += 1\n+ sequence.shrinkToFirstNucleotides(bestRegion)\n+ elif options.noAdaptor3p:\n+ writer3pNoAdaptor.addSequence(sequence)\n+ if options.fivePAdaptor != None:\n+ nucleotides = sequence.sequence\n+ found = False\n+ bestScore = 10000\n+ bestRegion = 0\n+ for i in reversed(range(minSize, len(nucleotides))):\n+ nucleotidesPart = nucleotides[:i]\n+ adaptorPart = options.fivePAdaptor if len(nucleotidesPart) >= len(options.fivePAdaptor) else options.fivePAdaptor[-len(nucleotidesPart):]\n+ nucleotidesPart = nucleotidesPart if len(adaptorPart) == len(nucleotidesPart) else nucleotidesPart[-len(adaptorPart):]\n+ if options.indels:\n+ score = Utils.getLevenshteinDistance(adaptorPart, nucleotidesPart)\n+ else:\n+ score = Utils.getHammingDistance(adaptorPart, nucleotidesPart)\n+ if score <= int(options.errors / 100.0 * len(adaptorPart)) and score < bestScore:\n+ bestScore = score\n+ bestRegion = i\n+ found = True\n+ if found:\n+ nbFound5p += 1\n+ 
sequence.shrinkToLastNucleotides(len(nucleotides) - bestRegion)\n+ elif options.noAdaptor5p:\n+ writer5pNoAdaptor.addSequence(sequence)\n+ writer.addSequence(sequence)\n+ progress.done()\n+ writer.close()\n+\n+ print "%d sequences" % (parser.getNbSequences())\n+ if options.fivePAdaptor != None:\n+ print "%d sequences with 5\' adaptors (%.2f%%)" % (nbFound5p, float(nbFound5p) / parser.getNbSequences() * 100)\n+ if options.threePAdaptor != None:\n+ print "%d sequences with 3\' adaptors (%.2f%%)" % (nbFound3p, float(nbFound3p) / parser.getNbSequences() * 100)\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/CleanTranscriptFile.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CleanTranscriptFile.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,73 @@ +<tool id="CleanTranscriptFile" name="clean Transcript File"> <!-- Galaxy wrapper around CleanTranscriptFile.py: "-f" mirrors the selected input format and "-t" is passed only when optionType.type is 'Yes'. --> + <description>Clean a transcript file so that it is useable for S-MART.</description> + <command interpreter="python"> ../Java/Python/CleanTranscriptFile.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #end if + #if $optionType.type == 'Yes': + -t $optionType.value + #end if + -o $outputFile + </command> + + <inputs> <!-- formatType selects the input datatype; each branch exposes an inputFileName dataset restricted to that datatype. --> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="gff">gff</option> + <option value="gtf">gtf</option> + <option value="gff3">gff3</option> + </param> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + </conditional> + + <conditional name="optionType"> + + <param name="type" type="select" label="You can choose the tag that you are interested in, like tRNA,rRNA,ncRNA,CDS,exon, etc." 
help="Name of the types you want to keep in GFF/GTF (list separated by commas)"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="value" type="text" value="tRNA,rRNA,ncRNA,CDS,exon"/> + </when> + <when value="No"> + </when> + </conditional> + + </inputs> + + + <outputs> <!-- The output datatype defaults to gtf; change_format switches it to gff or gff3 when that input format is selected. --> + <data name="outputFile" format="gtf"> + <change_format> + <when input="formatType.FormatInputFileName" value="gff" format="gff" /> + <when input="formatType.FormatInputFileName" value="gff3" format="gff3" /> + </change_format> + </data> + + </outputs> +<tests> + <test> + <param name="FormatInputFileName" value="gtf" /> + <param name="inputFileName" value="genes.gtf" /> + <param name="type" value="No" /> + <output name="outputFile" file="exp_cleantranscriptfile_genes.gtf" /> + </test> + </tests> + + <help> + A GFF/GTF file (please consult http://www.sequenceontology.org/gff3.shtml to know more about the GFF3 format, and http://mblab.wustl.edu/GTF22.html for the GTF format) may contain different sources of information: chromosome size, genes, transcripts, etc. S-MART mostly works on transcripts. This scripts filters the input file to keep the information you really want, based on the feature (3rd column). + </help> + +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/Clusterize.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/Clusterize.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,70 @@ +<tool id="MergingDataClusterize" name="clusterize"> <!-- Galaxy wrapper around clusterize.py: "-f" mirrors the selected input format; the colinear and normalize booleans expand to "-c" and "-n" (or to nothing); distance is forwarded as "-d". --> + <description>Clusterize features when their genomic intervals overlap.</description> + <command interpreter="python"> + ../Java/Python/clusterize.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + -o $outputFileGff + $colinear + $normalize + -d $distance + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + <param name="colinear" type="boolean" truevalue="-c" falsevalue="" checked="false" label="Only merge collinear features"/> + <param name="normalize" type="boolean" truevalue="-n" falsevalue="" checked="false" label="Normalize counts" help="Only works if the nbOccurrences tag is set."/> + 
<param name="distance" type="integer" value="0" label="merge features if their relative distance is within N nt"/> <!-- Declared as integer because the value is placed on the command line as the "-d" distance; free text is not a valid distance. --> + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3"/> + </outputs> + + <help> +The script clusterizes the input genomic data. Two features are clusterized when their genomic intervals overlap. The output is a GFF3 file, where each element is a cluster. The number of elements in the cluster is given by the tag **nbElements**. The name of a cluster is the concatenation of the names of its reads (like **read1--read2--read3**). Note that if the size of the name of the cluster exceeds 100 characters, it is truncated to the first 100 characters. + +Some options may clusterize the features which are closer than a given distance. + +By default, the tool clusterizes all features which overlap (or nearly overlap), even if they are on different strands. If you want to clusterize the features which are on the same strand only, you can specify it. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/CollapseReads.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CollapseReads.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,64 @@ +<tool id="collapseReads" name="collapse reads"> <!-- Galaxy wrapper around CollapseReads.py: "-f" mirrors the selected input format; the strand boolean already expands to "-s" (or to nothing), so it is inserted without an extra leading dash. --> + <description>Merges two genomic features if they have exactly the same genomic coordinates.</description> + <command interpreter="python"> + ../Java/Python/CollapseReads.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + $strand + -o $outputFileGff + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + <param name="strand" type="boolean" truevalue="-s" falsevalue="" checked="false" label="Merges features even if they are on different strands."/> + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3"/> + </outputs> + + <help> +Merge two input genomic coordinates iff they are exactly the 
same. If two or more genomic coordinates are merged, the tag **nbElements** is updated accordingly. As a consequence, all the reads which are exactly the same appear as one genomic coordinate. + +This is especially useful for short RNA sequencing (where you want to count the number of reads per miRNA, siRNA, etc.) or 5' capped short reads. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/CompareOverlappingSmallQuery.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CompareOverlappingSmallQuery.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
b'@@ -0,0 +1,200 @@\n+<tool id="CompareOverlappingSmallQuery" name="compare Overlapping Small Query">\n+\t<description>Provide the queries that overlap with a reference, when the query data set is small.</description> \n+\t<command interpreter="python">\n+\t\t../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1 \n+\t\t#if $formatType.FormatInputFileName1 == \'bed\': \n+\t\t-f bed\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff\':\n+\t\t\t-f gff\t\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff2\':\n+\t\t\t-f gff2\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff3\':\n+\t\t\t-f gff3\n+\t\t#elif $formatType.FormatInputFileName1 == \'sam\':\n+\t\t\t-f sam\n+\t\t#elif $formatType.FormatInputFileName1 == \'gtf\':\n+\t\t\t-f gtf\n+\t\t#end if\n+\t\t-j $formatType2.inputFileName2\n+\t\t#if $formatType2.FormatInputFileName2 == \'bed\':\n+\t\t\t-g bed\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff\':\n+\t\t\t-g gff\t\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff2\':\n+\t\t\t-g gff2\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff3\':\n+\t\t\t-g gff3\n+\t\t#elif $formatType2.FormatInputFileName2 == \'sam\':\n+\t\t\t-g sam\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gtf\':\n+\t\t -g gtf\n+\t\t#end if\n+\t\t-o $outputFileGff \n+\t\t#if $OptionDistance.Dist == \'Yes\':\n+\t\t\t-d $OptionDistance.distance\n+\t\t#end if\n+\t\t#if $OptionMinOverlap.present == \'Yes\':\n+\t\t\t-m $OptionMinOverlap.minOverlap\n+\t\t#end if\n+\t\t#if $OptionPcOverlapQuery.present == \'Yes\':\n+\t\t\t-p $OptionPcOverlapQuery.minOverlap\n+\t\t#end if\n+\t\t#if $OptionPcOverlapRef.present == \'Yes\':\n+\t\t\t-P $OptionPcOverlapRef.minOverlap\n+\t\t#end if\n+\t\t#if $OptionCollinearOrAntiSens.OptionCA == \'Collinear\':\n+\t\t\t-c \n+\t\t#elif $OptionCollinearOrAntiSens.OptionCA == \'AntiSens\':\n+\t\t\t-a\n+\t\t#end 
if\t\n+\t\t$InvertMatch\n+\t\t$NotOverlapping\n+\t\t$OptionInclusionQuery\n+\t\t$OptionInclusionRef\n+\t</command>\n+\n+\t<inputs>\n+\t\t<conditional name="formatType">\n+\t\t\t<param name="FormatInputFileName1" type="select" label="Input Query File Format">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName1" format="bed" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName1" format="gff" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName1" format="gff2" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName1" format="gff3" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName1" format="sam" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName1" format="gtf" type="data" label="Input File 1"/>\n+\t\t\t\t\t\t\t\t </when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="formatType2">\n+\t\t\t<param name="FormatInputFileName2" type="select" label="Input Reference File Format">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName2" format="bed" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName2" format="gff" type="data" 
label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName2" format="gff2" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName2" format="gff3" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName2" format="sam" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName2" format="gtf" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t<'..b'\t\t\t</when>\n+\t\t</conditional>\n+\t\t<conditional name="OptionMinOverlap">\n+\t\t\t<param name="present" type="select" label="Minimum number of common nucleotides to declare an overlap">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minOverlap" type="integer" value="0"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<conditional name="OptionPcOverlapQuery">\n+\t\t\t<param name="present" type="select" label="N% of the query must overlap">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minOverlap" type="integer" value="0"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<conditional name="OptionPcOverlapRef">\n+\t\t\t<param name="present" type="select" label="N% of the reference must overlap">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minOverlap" type="integer" value="0"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<param name="OptionInclusionQuery" type="boolean" truevalue="-k" falsevalue="" checked="false" label="The 
query must be nested in a query"/>\n+\t\t<param name="OptionInclusionRef" type="boolean" truevalue="-K" falsevalue="" checked="false" label="The reference must be nested in a query"/>\n+\t\t<conditional name="OptionCollinearOrAntiSens">\n+\t\t\t<param name="OptionCA" type="select" label="Collinear or anti-sens">\n+\t\t\t\t<option value="Collinear">Collinear</option>\n+\t\t\t\t<option value="AntiSens">AntiSens</option>\n+\t\t\t\t<option value="NONE" selected="true">NONE</option>\n+\t\t\t</param>\n+\t\t\t<when value="Collinear">\n+\t\t\t</when>\n+\t\t\t<when value="AntiSens">\n+\t\t\t</when>\n+\t\t\t<when value="NONE">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<param name="InvertMatch" type="boolean" truevalue="-x" falsevalue="" checked="false" label="Invert match: the output file will contain all query elements which do NOT overlap"/>\n+\t\t<param name="NotOverlapping" type="boolean" truevalue="-O" falsevalue="" checked="false" label="Also report the query data which do not overlap, with the nbOverlaps tag set to 0."/>\n+\t</inputs>\n+\n+\t<outputs>\n+\t\t<data name="outputFileGff" format="gff3"/>\n+\t</outputs> \n+\n+\t<help>\n+This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. 
The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example).\n+ \n+It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one.\n+\n+Various modifiers are also available:\n+\n+-Invert selection (report those which do not overlap).\n+\n+-Restrict to colinear / anti-sense overlapping data.\n+\n+-Keep the query data even if they do not strictly overlap with the reference data, but are located not further away than *n* nucleotides from some reference data.\n+\n+-Keep the query data which are strictly included in reference data, meaning that a query transcript such that at least 1 nucleotide does not overlap with reference data will not be presented as a solution.\n+\n+The mechanism of shrinking and extending is also useful to make a fine-grained comparison. For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. Now, if you want to keep those which are overlapping your data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcription factor sites which are upstream.\n+\n+One option reverses the selection. In other words, it performs the comparison as usual, and outputs all those query data which do not overlap.\n+\t</help>\n+</tool>\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/CompareOverlappingSmallRef.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CompareOverlappingSmallRef.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
b'@@ -0,0 +1,200 @@\n+<tool id="CompareOverlappingSmallRef" name="compare Overlapping Small Reference">\n+\t<description>Provide the queries that overlap with a reference, when the reference dataset is small.</description> \n+\t<command interpreter="python">\n+\t\t../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1 \n+\t\t#if $formatType.FormatInputFileName1 == \'bed\': \n+\t\t-f bed\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff\':\n+\t\t\t-f gff\t\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff2\':\n+\t\t\t-f gff2\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff3\':\n+\t\t\t-f gff3\n+\t\t#elif $formatType.FormatInputFileName1 == \'sam\':\n+\t\t\t-f sam\n+\t\t#elif $formatType.FormatInputFileName1 == \'gtf\':\n+\t\t\t-f gtf\n+\t\t#end if\n+\t\t-j $formatType2.inputFileName2\n+\t\t#if $formatType2.FormatInputFileName2 == \'bed\':\n+\t\t\t-g bed\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff\':\n+\t\t\t-g gff\t\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff2\':\n+\t\t\t-g gff2\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff3\':\n+\t\t\t-g gff3\n+\t\t#elif $formatType2.FormatInputFileName2 == \'sam\':\n+\t\t\t-g sam\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gtf\':\n+\t\t -g gtf\n+\t\t#end if\n+\t\t-o $outputFileGff \n+\t\t#if $OptionDistance.Dist == \'Yes\':\n+\t\t\t-d $OptionDistance.distance\n+\t\t#end if\n+\t\t#if $OptionMinOverlap.present == \'Yes\':\n+\t\t\t-m $OptionMinOverlap.minOverlap\n+\t\t#end if\n+\t\t#if $OptionPcOverlapQuery.present == \'Yes\':\n+\t\t\t-p $OptionPcOverlapQuery.minOverlap\n+\t\t#end if\n+\t\t#if $OptionPcOverlapRef.present == \'Yes\':\n+\t\t\t-P $OptionPcOverlapRef.minOverlap\n+\t\t#end if\n+\t\t#if $OptionCollinearOrAntiSens.OptionCA == \'Collinear\':\n+\t\t\t-c \n+\t\t#elif $OptionCollinearOrAntiSens.OptionCA == \'AntiSens\':\n+\t\t\t-a\n+\t\t#end 
if\t\n+\t\t$InvertMatch\n+\t\t$NotOverlapping\n+\t\t$OptionInclusionQuery\n+\t\t$OptionInclusionRef\n+\t</command>\n+\n+\t<inputs>\n+\t\t<conditional name="formatType">\n+\t\t\t<param name="FormatInputFileName1" type="select" label="Input Query File Format">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName1" format="bed" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName1" format="gff" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName1" format="gff2" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName1" format="gff3" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName1" format="sam" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName1" format="gtf" type="data" label="Input File 1"/>\n+\t\t\t\t\t\t\t\t </when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="formatType2">\n+\t\t\t<param name="FormatInputFileName2" type="select" label="Input Reference File Format">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName2" format="bed" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName2" format="gff" type="data" 
label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName2" format="gff2" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName2" format="gff3" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName2" format="sam" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName2" format="gtf" type="data" label="Input File 2"/>\n+\t\t\t</when>'..b'\t\t\t</when>\n+\t\t</conditional>\n+\t\t<conditional name="OptionMinOverlap">\n+\t\t\t<param name="present" type="select" label="Minimum number of common nucleotides to declare an overlap">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minOverlap" type="integer" value="0"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<conditional name="OptionPcOverlapQuery">\n+\t\t\t<param name="present" type="select" label="N% of the query must overlap">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minOverlap" type="integer" value="0"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<conditional name="OptionPcOverlapRef">\n+\t\t\t<param name="present" type="select" label="N% of the reference must overlap">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minOverlap" type="integer" value="0"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<param name="OptionInclusionQuery" type="boolean" truevalue="-k" falsevalue="" checked="false" label="The query must 
be nested in a query"/>\n+\t\t<param name="OptionInclusionRef" type="boolean" truevalue="-K" falsevalue="" checked="false" label="The reference must be nested in a query"/>\n+\t\t<conditional name="OptionCollinearOrAntiSens">\n+\t\t\t<param name="OptionCA" type="select" label="Collinear or anti-sens">\n+\t\t\t\t<option value="Collinear">Collinear</option>\n+\t\t\t\t<option value="AntiSens">AntiSens</option>\n+\t\t\t\t<option value="NONE" selected="true">NONE</option>\n+\t\t\t</param>\n+\t\t\t<when value="Collinear">\n+\t\t\t</when>\n+\t\t\t<when value="AntiSens">\n+\t\t\t</when>\n+\t\t\t<when value="NONE">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<param name="InvertMatch" type="boolean" truevalue="-x" falsevalue="" checked="false" label="Invert match: the output file will contain all query elements which do NOT overlap"/>\n+\t\t<param name="NotOverlapping" type="boolean" truevalue="-O" falsevalue="" checked="false" label="Also report the query data which do not overlap, with the nbOverlaps tag set to 0."/>\n+\t</inputs>\n+\n+\t<outputs>\n+\t\t<data name="outputFileGff" format="gff3"/>\n+\t</outputs> \n+\n+\t<help>\n+This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. 
The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example).\n+ \n+It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one.\n+\n+Various modifiers are also available:\n+\n+-Invert selection (report those which do not overlap).\n+\n+-Restrict to colinear / anti-sense overlapping data.\n+\n+-Keep the query data even if they do not strictly overlap with the reference data, but are located not further away than *n* nucleotides from some reference data.\n+\n+-Keep the query data which are strictly included in reference data, meaning that a query transcript such that at least 1 nucleotide does not overlap with reference data will not be presented as a solution.\n+\n+The mechanism of shrinking and extending is also useful to make a fine-grained comparison. For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. Now, if you want to keep those which are overlapping your data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcription factor sites which are upstream.\n+\n+One option reverses the selection. In other words, it performs the comparison as usual, and outputs all those query data which do not overlap.\n+\t</help>\n+</tool>\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/ConvertTranscriptFile.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,95 @@ +<tool id="ConvertTranscriptFile" name="convert transcript file"> <!-- Galaxy wrapper for convertTranscriptFile.py: -i/-f give the input dataset and its format, -g the output format, -n the transcript name, -o the output dataset. The output datatype follows the selected output format through change_format (default gff3). NOTE(review): BAM input is now passed as "-f bam" instead of "-f blast"; confirm that convertTranscriptFile.py accepts "bam". --> + <description>Convert a file from a format to another.</description> + <command interpreter="python"> ../Java/Python/convertTranscriptFile.py -i $inputFormatType.inputFileName + #if $inputFormatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $inputFormatType.FormatInputFileName == 'bed': + -f bed + #elif $inputFormatType.FormatInputFileName == 'bam': + -f bam + #elif $inputFormatType.FormatInputFileName == 'sam': + -f sam + #elif $inputFormatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + -g $outputFormatType.outFormat + + -n $name + $strand + -o $outputFile + + </command> + <inputs> + <conditional name="inputFormatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="gff3">GFF3</option> + <option value="bed">BED</option> + <option value="bam">BAM</option> + <option value="sam">SAM</option> + <option value="gtf">GTF</option> + </param> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="bam"> + <param name="inputFileName" format="bam" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + + <conditional name="outputFormatType"> + <param name="outFormat" type="select" label="Please choose the format that you want to convert to (corresponding to your input file format)."> + <option value="gff3">GFF3</option> + <option value="bed">BED</option> + <option value="wig">WIG</option> + <option value="sam">SAM</option> + <option value="csv">CSV</option> + <option value="gtf">GTF</option> + </param> + <when value="gff3"> + </when> + 
<when value="bed"> + </when> + <when value="wig"> + </when> + <when value="sam"> + </when> + <when value="csv"> + </when> + <when value="gtf"> + </when> + </conditional> + + <param name="name" type="text" value="SMART" label="name for the transcripts"/> + + <param name="strand" type="boolean" truevalue="-t" falsevalue="" checked="false" label="consider the 2 strands as different (only useful for writing WIG files)"/> + + </inputs> + + <outputs> + <data name="outputFile" format="gff3" label="$inputFormatType.FormatInputFileName to $outputFormatType.outFormat"> + <change_format> + <when input="outputFormatType.outFormat" value="bed" format="bed" /> + <when input="outputFormatType.outFormat" value="gff3" format="gff3" /> + <when input="outputFormatType.outFormat" value="wig" format="wig" /> + <when input="outputFormatType.outFormat" value="sam" format="sam" /> + <when input="outputFormatType.outFormat" value="csv" format="csv" /> + <when input="outputFormatType.outFormat" value="gtf" format="gtf" /> + </change_format> + </data> + </outputs> + + <help> +Simple conversion tool. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/CountReadGCPercent.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CountReadGCPercent.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,17 @@ +<tool id="CountReadGCPercent" name="count read GCpercent"> <!-- Galaxy wrapper: runs CountReadGCPercent.py with a FASTA reference dataset (-i) and a GFF3 dataset (-j), and writes a GFF3 dataset (-o). --> + <description>Count GC percent for each read against a genome.</description> + <command interpreter="python"> ../Java/Python/CountReadGCPercent.py -i $inputFastaFile -j $inputGffFile -o $outputFile</command> + <inputs> + <param name="inputFastaFile" type="data" label="Input reference fasta File" format="fasta"/> + <param name="inputGffFile" type="data" label="Input File" format="gff3"/> + </inputs> + + <outputs> + <data format="gff3" name="outputFile" label="[CountReadGCPercent] Output File"/> + </outputs> + + <help> +Count the GC% of a FASTA file. + </help> +</tool> + |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/GetDifferentialExpression.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/GetDifferentialExpression.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,211 @@\n+<tool id="GetDifferentialExpression" name="get differential expression">\n+\t<description>Get the differential expression between 2 conditions using Fisher\'s exact test, on regions defined by a third file.</description>\n+\t<command interpreter="python">\n+\t\t../Java/Python/GetDifferentialExpression.py -i $formatType.inputFileName1\n+\t\t#if $formatType.FormatInputFileName1 == \'bed\':\n+\t\t\t-f bed\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff\':\n+\t\t\t-f gff\t\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff2\':\n+\t\t\t-f gff2\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff3\':\n+\t\t\t-f gff3\n+\t\t#elif $formatType.FormatInputFileName1 == \'sam\':\n+\t\t\t-f sam\n+\t\t#elif $formatType.FormatInputFileName1 == \'gtf\':\n+\t\t\t-f gtf\n+\t\t#end if\n+\t\t\t\n+\t\t-j $formatType2.inputFileName2\n+\t\t#if $formatType2.FormatInputFileName2 == \'bed\':\n+\t\t\t-g bed\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff\':\n+\t\t\t-g gff\t\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff2\':\n+\t\t\t-g gff2\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff3\':\n+\t\t\t-g gff3\n+\t\t#elif $formatType2.FormatInputFileName2 == \'sam\':\n+\t\t\t-g sam\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gtf\':\n+\t\t\t-g gtf\n+\t\t#end if\n+\t\t\t\n+\t\t-k $formatTypeRef.inputFileNameRef\n+\t\t#if $formatTypeRef.FormatInputFileNameRef == \'bed\':\n+\t\t\t-l bed\n+\t\t#elif $formatTypeRef.FormatInputFileNameRef == \'gff\':\n+\t\t\t-l gff\t\n+\t\t#elif $formatTypeRef.FormatInputFileNameRef == \'gff2\':\n+\t\t\t-l gff2\n+\t\t#elif $formatTypeRef.FormatInputFileNameRef == \'gff3\':\n+\t\t\t-l gff3\n+\t\t#elif $formatTypeRef.FormatInputFileNameRef == \'sam\':\n+\t\t\t-l sam\n+\t\t#elif $formatTypeRef.FormatInputFileNameRef == \'gtf\':\n+\t\t\t-l gtf\n+\t\t#end if\n+\t\t\n+\t\t-o $outputFileGff \n+\t\t\n+\t\t$simple\n+\t\t$adjusted\n+\n+\t\t#if $optionSimplePara.simplePara == \'Yes\':\n+\t\t\t-S 
$optionSimplePara.paraValue\n+\t\t#end if\t\t\n+\t\t\n+\t\t#if $optionFixedSizeFactor.FSF == \'Yes\':\n+\t\t\t-x $optionFixedSizeFactor.FSFValue\n+\t\t#end if\n+\t\t\n+\t\t#if $optionFDR.FDR == \'Yes\':\n+\t\t\t-d $optionFDR.FDRValue\n+\t\t#end if\n+\t</command>\n+\n+\t<inputs>\n+\t\t<conditional name="formatType">\n+\t\t\t<param name="FormatInputFileName1" type="select" label="Input File Format 1">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName1" format="bed" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName1" format="gff" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName1" format="gff2" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName1" format="gff3" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName1" format="sam" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName1" format="gtf" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="formatType2">\n+\t\t\t<param name="FormatInputFileName2" type="select" label="Input File Format 2">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName2" format="bed" type="data" label="Input 
File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName2" format="gff" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName2" format="gff2" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName2" format="gff3" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName2" format="sam" type="data" label'..b'n value="gtf">\n+\t\t\t\t<param name="inputFileNameRef" format="gtf" type="data" label="Input Ref File"/>\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<param name="simple" type="boolean" truevalue="-s" falsevalue="" checked="false" label="Normalize using the number of reads in each condition"/>\n+\t\t<param name="adjusted" type="boolean" truevalue="-a" falsevalue="" checked="false" label="Normalize using the number of reads of interquartile expression region"/>\n+\t\t\n+\t\t<conditional name="optionSimplePara">\n+\t\t\t<param name="simplePara" type="select" label="provide the number of reads" >\n+\t\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="paraValue" type="text" value="None" label="provide the number of reads" />\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="optionFixedSizeFactor">\n+\t\t\t<param name="FSF" type="select" label="Give the magnification factor for the normalization using fixed size sliding windows in reference regions (leave empty for no such normalization)">\n+\t\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="FSFValue" type="integer" value="0" />\n+\t\t\t</when>\n+\t\t\t<when 
value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t\n+\t\t<conditional name="optionFDR">\n+\t\t\t<param name="FDR" type="select" label="use FDR">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="FDRValue" type="float" value="0.0"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t\n+\t</inputs>\n+\n+\t<outputs>\n+\t\t<data name="outputFileGff" format="gff3" label="[GetDifferentialExpression]out file"/>\n+\t</outputs> \n+\t\n+\t<help>\n+This tool compares two sets of data and find the differential expression. One very important component of the tool is the reference set. Actually, to use the tool, you need the two input sets of data, of course, and the reference set. The reference set is a set of genomic coordinates and, for each interval, it will count the number of feature on each sample and compute the differential expression. For each reference interval, it will output the direction of the regulation (up or down, with respect to the first input set), and a *p*-value from a Fisher exact test.\n+\n+This reference set seems boring. Why not computing the differential expression without this set? The answer is: the differential expression of what? I cannot guess it. Actually, you might want to compare the expression of genes, of small RNAs, of transposable elements, of anything... So the reference set can be a list of genes, and in this case, you can compute the differential expression of genes. But you can also compute many other things.\n+\n+Suppose that you cluster the data of your two input samples (you can do it with the *clusterize* and the *mergeTranscriptLists* tools). You now have a list of all the regions which are transcribed in at least one of the input samples. This can be your reference set. 
This reference set is interesting since you can detect the differential expression of data which is outside any annotation.\n+\n+Suppose now that you clusterize the two input samples using a sliding window (you can do it with the *clusterizeBySlidingWindows* and the *mergeSlidingWindowsClusters* tools). You can now select all the regions of a given size which contain at least one read in one of the two input samples (do it with *selectByTag* and the tag **nbElements**). Again, this can be another interesting reference set.\n+\n+In most cases, the sizes of the two input samples will be different, so you should probably normalize the data, which is an available option. The ---rather crude--- normalization increases the number of data in the least populated sample and decreases the number of data in the most populated sample to the average number of data.\n+\t</help>\n+</tool>\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/GetFlanking.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/GetFlanking.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,191 @@ +<tool id="GetFlanking" name="get flanking"> <!-- Galaxy wrapper for GetFlanking.py: -i/-f query dataset and format, -j/-g reference dataset and format, -5 or -3 to keep only upstream or downstream elements, -c or -a to keep only collinear or antisense elements, -D/-d maximum/minimum distance, -o output GFF3 dataset. --> + <description>Get the flanking regions of a set of reference.</description> + <command interpreter="python"> + ../Java/Python/GetFlanking.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + #if $OptionUpDownStream.OptionUD == 'UpStream': + -5 + #elif $OptionUpDownStream.OptionUD == 'DownStream': + -3 + #end if + + + #if $OptionColinearOrAntiSens.OptionCA == 'Colinear': + -c + #elif $OptionColinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + + #if $OptionMax.maximum == "Yes": + -D $OptionMax.max + #end if + #if $OptionMin.minimum == "Yes": + -d $OptionMin.min + #end if + + -o $outputFile + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName1" type="select" label="query File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName1" format="bed" type="data" label="Input File 1"/> + </when> + <when value="gff"> + <param name="inputFileName1" format="gff" type="data" label="Input 
File 1"/> + </when> + <when value="gff2"> + <param name="inputFileName1" format="gff2" type="data" label="Input File 1"/> + </when> + <when value="gff3"> + <param name="inputFileName1" format="gff3" type="data" label="Input File 1"/> + </when> + <when value="sam"> + <param name="inputFileName1" format="sam" type="data" label="Input File 1"/> + </when> + <when value="gtf"> + <param name="inputFileName1" format="gtf" type="data" label="Input File 1"/> + </when> + </conditional> + + <conditional name="formatType2"> + <param name="FormatInputFileName2" type="select" label="Reference File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName2" format="bed" type="data" label="Input File 2"/> + </when> + <when value="gff"> + <param name="inputFileName2" format="gff" type="data" label="Input File 2"/> + </when> + <when value="gff2"> + <param name="inputFileName2" format="gff2" type="data" label="Input File 2"/> + </when> + <when value="gff3"> + <param name="inputFileName2" format="gff3" type="data" label="Input File 2"/> + </when> + <when value="sam"> + <param name="inputFileName2" format="sam" type="data" label="Input File 2"/> + </when> + <when value="gtf"> + <param name="inputFileName2" format="gtf" type="data" label="Input File 2"/> + </when> + </conditional> + + <conditional name="OptionUpDownStream"> + <param name="OptionUD" type="select" label="Only provide upstream/downstream features"> + <option value="UpStream">UpStream</option> + <option value="DownStream">DownStream</option> + <option value="NONE" selected="true">NONE</option> + </param> + <when value="UpStream"> + </when> + <when value="DownStream"> + </when> + <when value="NONE"> + </when> + </conditional> + + <conditional name="OptionColinearOrAntiSens"> + <param name="OptionCA" 
type="select" label="Only provide collinear/antisens features"> + <option value="Colinear">Collinear</option> + <option value="AntiSens">AntiSens</option> + <option value="NONE" selected="true">NONE</option> + </param> + <when value="Colinear"> + </when> + <when value="AntiSens"> + </when> + <when value="NONE"> + </when> + </conditional> + + <conditional name="OptionMax"> + <param name="maximum" type="select" label="maximum distance between 2 elements"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="max" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionMin"> + <param name="minimum" type="select" label="minimum distance between 2 elements"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="min" type="integer" value="0" /> + </when> + <when value="No"> + </when> + </conditional> + + </inputs> + + + <outputs> + <data format="gff3" name="outputFile" label="[GetFlanking] Output File"/> + </outputs> + + <help> +This tool prints the elements from the second set of genomic intervals which are closest to (in other words, are flanking) the elements from the first set. You can also play on different parameters: + +- restrict the search to downstream or upstream elements, or print downstream and upstream elements, + +- only consider collinear flanking elements, + +- only consider anti-sense flanking elements, + +- only consider elements which are close enough (using some given distance), + +- only consider flanking elements which do not overlap with the reference element. + +Notice that elements from the second sets may be printed at most once, whether they are the flanking element of several elements from the first or not. + </help> + +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/SelectByTag.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/SelectByTag.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,119 @@ +<tool id="SelectByTag" name="select by tag"> <!-- Galaxy wrapper for SelectByTag.py: -i/-f input dataset and format, -g tag name, -a exact tag value, -M/-m maximum/minimum tag value, -d default value used when the tag is absent, -o output GFF3 dataset. The format chain must open with #if: Cheetah rejects an #elif that has no preceding #if. --> + <description>Keep the genomic coordinates such that a value of a given tag.</description> + <command interpreter="python"> + ../Java/Python/SelectByTag.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + -g $Tag + #if $OptionValue.Value == "Yes": + -a $OptionValue.valeur + #end if + #if $OptionMax.maximum == "Yes": + -M $OptionMax.max + #end if + #if $OptionMin.minimum == "Yes": + -m $OptionMin.min + #end if + + #if $OptionDefault.default == "Yes": + -d $OptionDefault.defaultValue + #end if + + -o $outputFileGff + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="gtf">gtf</option> + </param> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + <param name="Tag" type="text" value="None" label="tag option" help="A given tag, you must choose a tag."/> + + <conditional name="OptionValue"> + <param name="Value" type="select" label="given value for the tag"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="valeur" type="integer" value="1"/> + </when> + <when value="No"> + </when> + </conditional> 
+ + <conditional name="OptionMax"> + <param name="maximum" type="select" label="maximum value of tag"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="max" type="integer" value="1"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionMin"> + <param name="minimum" type="select" label="minimum value of tag"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="min" type="integer" value="1"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionDefault"> + <param name="default" type="select" label="give this value if tag is not present"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="defaultValue" type="float" value="0" /> + </when> + <when value="No"> + </when> + </conditional> + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3" label="[SelectByTag] Output File"/> + </outputs> + + <help> +The script reads a list of genomic coordinates and output all the features with specific tag values. If you want to know more about tags, please consult the GFF format page: http://www.sequenceontology.org/gff3.shtml + +The tools reads the input file, and more specifically the tag that you specified. You can mention a lower and a upper bound for its value, or a specific value, and the tool will print all the features such that the tags are between the specified bounds or matches the string. + +A tag has to be present for each feature. If not, you can specify a default value which will be used if the tag is absent. 
+ +This tool can be used to select the clusters with a minimum number of elements (the tag **nbElements** counts the number of elements per clusters) or to select the reads which have mapped less than *n* times (the tag **nbOccurrences** counts the number of mappings per read). + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/WrappGetLetterDistribution.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/WrappGetLetterDistribution.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,45 @@ +<tool id="getLetterDistribution1" name="get letter distribution"> <!-- Galaxy wrapper for WrappGetLetterDistribution.py: -i/-f input reads and their format (fasta or fastq), -c CSV output, -a and -b the two PNG plots. The input dataset is declared inside the "formatType" conditional, so the command must reference it as $formatType.inputFileName. The test output name must match the declared output "ouputFileNameCSV". --> + <description>Calculate distribution for each nucleotide per position for all short reads</description> + <command interpreter="python"> + WrappGetLetterDistribution.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'fasta': + -f fasta + #else : + -f fastq + #end if + -c $ouputFileNameCSV -a $ouputFileNamePNG1 -b $ouputFileNamePNG2 + </command> + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="fasta">fasta</option> + <option value="fastq" selected="true">fastq</option> + </param> + <when value="fasta"> + <param name="inputFileName" format="fasta" type="data" label="Fasta Input File"/> + </when> + <when value="fastq"> + <param name="inputFileName" format="fastq" type="data" label="Fastq Input File"/> + </when> + </conditional> + </inputs> + + <outputs> + <data name="ouputFileNameCSV" format="tabular" label="[getLetterDistribution] CSV File"/> + <data name="ouputFileNamePNG1" format="png" label="[getLetterDistribution] PNG File 1"/> + <data name="ouputFileNamePNG2" format="png" label="[getLetterDistribution] PNG File 2"/> + </outputs> + <tests> + <test> + <param name="FormatInputFileName" value="fastq" /> + <param name="inputFileName" value="short_fastq.fastq" /> + <output name="ouputFileNameCSV" file="exp_getletterdistribution_short_fastq.csv" /> + </test> + </tests> + + <help> +The script gets the nucleotide distribution of the input sequence list. It outputs two files. The first file shows the nucleotide distribution of the data. More precisely, a point (*x*, *y*) on the curve **A** shows that *y* sequences have *x* % of **A**. + +The second plot shows the average nucleotide distribution for each position of the read. You can use it to detect a bias in the first nucleotides, for instance. A point *x*, *y* on the curve **A** shows that at the position *x*, there are *y*% of **A**. 
A point (*x*, *y*) on the curve **#** tells you that *y* % of the sequences contain not less than *x* nucleotides. By definition, this latter line is a decreasing function. It usually explains why the tail of the other curves are sometimes erratic: there are few sequences. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/changeGffFeatures.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/changeGffFeatures.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,20 @@ +<tool id="changeGffFeatures" name="change gff Features"> <!-- Galaxy wrapper for changeGffFeatures.sh: positional arguments are the input GFF dataset, the feature to replace and the new feature; the script's standard output is redirected to the output dataset. The two free-text arguments are quoted so that each one reaches the script as a single argument. --> + <description>Change a feature in a GFF file (the feature is the 3rd column).</description> + <command interpreter="bash"> + ../Java/Python/changeGffFeatures.sh $inputFile "$inputFeature" "$outputFeature" >$outputFile + </command> + <inputs> + <param name="inputFile" type="data" label="Input File" format="gff"/> + <param name="inputFeature" type="text" value="exon" label="The feature you want to change"/> + <param name="outputFeature" type="text" value="exon" label="The new feature"/> + </inputs> + + <outputs> + <data name="outputFile" format="gff" label="[changeGffFeatures] Output File"/> + </outputs> + + <help> + This script changes the third column of a GFF3 file (please refer to http://www.sequenceontology.org/gff3.shtml to know more about this format). + </help> +</tool> + |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/changeTagName.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/changeTagName.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,48 @@ +<tool id="changeTagName" name="change tag name"> <!-- Galaxy wrapper for changeTagName.py: -i/-f input dataset and format, -t tag to rename, -n new tag name, -o output GFF3 dataset. The format chain must open with #if: Cheetah rejects an #elif that has no preceding #if. --> + <description>Change the name of a tag in a GFF file.</description> + <command interpreter="python"> + ../Java/Python/changeTagName.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #end if + + -t $Tag + -n $name + + -o $outputFileGff + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + </param> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + </conditional> + + <param name="Tag" type="text" label="tag option" help="The tag you want to change"/> + <param name="name" type="text" label="name option" help="A new name for the tag"/> + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3" label="[changeTagName] Output File"/> + </outputs> + + <help> + Change the name of a tag in the 9th field of a GFF3 file (please consult http://www.sequenceontology.org/gff3.shtml to know more about this format). + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/clusterizeBySlidingWindows.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/clusterizeBySlidingWindows.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,138 @@ +<tool id="clusterizeBySlidingWindows" name="clusterize By SlidingWindows"> + <description>Produces a GFF3 file that clusters a list of transcripts using a sliding window. Cluster the data into regions (defined by size and overlap with next region).</description> + <command interpreter="python"> + ../Java/Python/clusterizeBySlidingWindows.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + -s $size + -e $overlap + -o $outputFileGff + $normalize + $strands + + #if $OptionTag.tag == "Yes": + -g $OptionTag.value + #end if + + #if $OptionsOperation.operation == "Yes": + -r $OptionsOperation.value + #end if + + #if $OptionWriteTag.writeTag == "Yes": + -w $OptionWriteTag.value + #end if + + $strand + $plot $plotPng + $excel $excelOutput + + + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input 
File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + + <param name="size" type="text" value="50000" label="Size option" help="Size of the regions."/> + <param name="overlap" type="text" value="50" label="Overlap option" help="Overlap between two consecutive regions."/> + <param name="normalize" type="boolean" truevalue="-m" falsevalue="" checked="false" label="Normalize option for only GFF3 file format" help="(only work if the tag nbOccurrences is set)"/> + <param name="strands" type="boolean" truevalue="-2" falsevalue="" checked="false" label="Consider the two strands separately"/> + + <conditional name="OptionTag"> + <param name="tag" type="select" label="Use a given tag as input (instead of summing number of features)"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="value" type="select" label="tag name"/> + </when> + <when value="No"> + </when> + </conditional> + + + <conditional name="OptionsOperation"> + <param name="operation" type="select" label="combine tag value with given operation"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="value" type="select" label="operation" help="You can ONLY choose one of following operation : sum, avg, med, min, max."> + <option value="sum">sum</option> + <option value="avg">average</option> + <option value="med">median</option> + <option value="min">minimum</option> + <option value="max">maximum</option> + </param> + </when> + <when value="No"> + </when> + </conditional> + + + <conditional name="OptionWriteTag"> + <param name="writeTag" type="select" label="write a new tag in output file"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="value" type="text" value="nbElements" label="write 
tag option" help="print the result in the given tag (default usually is 'nbElements')"/> + </when> + <when value="No"> + </when> + </conditional> + + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3"/> + </outputs> + + <help> +Sliding windows are a convenient way to clusterize data mapped on the genome. There are two important parameters of a sliding window: the size of the window and the size of the overlap. + +By default, sliding windows count the number of reads in each window. However, you can basically merge any information which is contained in the tags. You can compute the average, sum, median, max or min of the tags for each window. For instance, every window can contain the average cluster size, if you merge clusters instead of reads. + +The output file is a GFF3 file, where each element is a window. There is a special tag for each window, whose name is **nbElements** if you counted the number of transcripts per sliding window. However, if you performed a **min** (resp. **max**, **sum**, **median**, **average**) operation on the tags **value** of the transcripts, then the tag of the window will be **minValue** (resp. **maxValue**, **sumValue**, **medValue**, **avgValue**). You can also specify the name of your tag (which is actually advised: **nbReadsInSample1** will always be more informative than **nbElements**). + +You also have different options, which can select the *n* % highest regions, or the regions with at least *n* features in them, or even the regions with at least *n* unique features. This last option is useful when you want to cluster the reads which have mapped only once, for instance. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/compareOverlapping.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/compareOverlapping.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
b'@@ -0,0 +1,307 @@\n+<tool id="CompareOverlapping" name="compare Overlapping">\n+\t<description>Print all the transcripts from a first file which overlap with the transcripts from a second file.</description>\n+\t<command interpreter="python">\n+\t\t../Java/Python/CompareOverlapping.py -i $formatType.inputFileName1\n+\t\t#if $formatType.FormatInputFileName1 == \'bed\':\n+\t\t\t-f bed\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff\':\n+\t\t\t-f gff\t\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff2\':\n+\t\t\t-f gff2\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff3\':\n+\t\t\t-f gff3\n+\t\t#elif $formatType.FormatInputFileName1 == \'sam\':\n+\t\t\t-f sam\n+\t\t#elif $formatType.FormatInputFileName1 == \'gtf\':\n+\t\t\t-f gtf\n+\t\t#end if\n+\t\t\t\n+\t\t-j $formatType2.inputFileName2\n+\t\t#if $formatType2.FormatInputFileName2 == \'bed\':\n+\t\t\t-g bed\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff\':\n+\t\t\t-g gff\t\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff2\':\n+\t\t\t-g gff2\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff3\':\n+\t\t\t-g gff3\n+\t\t#elif $formatType2.FormatInputFileName2 == \'sam\':\n+\t\t\t-g sam\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gtf\':\n+\t\t -g gtf\n+\t\t#end if\n+\n+\t\t-o $outputFileGff \n+\n+\t\t#if $optionNFirstFile1.NFirstForFile1 == \'Yes\':\n+\t\t\t-S $optionNFirstFile1.firstNtFile1\n+\t\t#end if\n+\t\t#if $optionNFirstFile2.NFirstForFile2 == \'Yes\':\n+\t\t\t-s $optionNFirstFile2.firstNtFile2\n+\t\t#end if\n+\t\t#if $optionNLastFile1.NLastForFile1 == \'Yes\':\n+\t\t\t-U $optionNLastFile1.lastNtFile1\n+\t\t#end if\n+\t\t#if $optionNLastFile2.NLastForFile2 == \'Yes\':\n+\t\t\t-u $optionNLastFile2.lastNtFile2\n+\t\t#end if\n+\t\n+\t\t#if $optionExtentionCinqFile1.extentionFile1 == \'Yes\':\n+\t\t\t-E $optionExtentionCinqFile1.extention51\n+\t\t#end if\n+\t\t#if $optionExtentionCinqFile2.extentionFile2 == \'Yes\':\n+\t\t\t-e $optionExtentionCinqFile2.extention52\n+\t\t#end 
if\n+\n+\t\t#if $optionExtentionTroisFile1.extentionFile1 == \'Yes\':\n+\t\t\t-N $optionExtentionTroisFile1.extention31\n+\t\t#end if\n+\t\t#if $optionExtentionTroisFile2.extentionFile2 == \'Yes\':\n+\t\t\t-n $optionExtentionTroisFile2.extention32\n+\t\t#end if\t\n+\n+\t\t#if $OptionColinearOrAntiSens.OptionCA == \'Colinear\':\n+\t\t\t-c \n+\t\t#elif $OptionColinearOrAntiSens.OptionCA == \'AntiSens\':\n+\t\t\t-a\n+\t\t#end if\t\n+\n+\t\t#if $OptionDistance.Dist == \'Yes\':\n+\t\t\t-d $OptionDistance.distance\n+\t\t#end if\n+\n+\t\t#if $OptionMinOverlap.MO == \'Yes\':\n+\t\t\t-m $OptionMinOverlap.minOverlap\n+\t\t#end if\n+\n+\t\t$InvertMatch\n+\t\t$ReportIntron\n+\t\t$NotOverlapping\n+\t\t\n+\t</command>\n+\n+\t<inputs>\n+\t\t<conditional name="formatType">\n+\t\t\t<param name="FormatInputFileName1" type="select" label="Input File Format 1">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName1" format="bed" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName1" format="gff" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName1" format="gff2" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName1" format="gff3" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName1" format="sam" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName1" format="gtf" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="formatType2">\n+\t\t\t<param 
name="FormatInputFileName2" type="select" label="Input File Format 2">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName2" format="bed" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t'..b's">\n+\t\t\t\t<param name="extention32" type="integer" value="1" label="in file 2" />\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="OptionColinearOrAntiSens">\n+\t\t\t<param name="OptionCA" type="select" label="Report queries which are collinear/antisens w.r.t. a reference">\n+\t\t\t\t<option value="Colinear">Colinear</option>\n+\t\t\t\t<option value="AntiSens">AntiSens</option>\n+\t\t\t\t<option value="NONE" selected="true">NONE</option>\n+\t\t\t</param>\n+\t\t\t<when value="Colinear">\n+\t\t\t</when>\n+\t\t\t<when value="AntiSens">\n+\t\t\t</when>\n+\t\t\t<when value="NONE">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="OptionDistance">\n+\t\t\t<param name="Dist" type="select" label="Maximum Distance between two reads">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="distance" type="integer" value="0"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="OptionMinOverlap">\n+\t\t\t<param name="MO" type="select" label="Minimum number of overlapping between two reads">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minOverlap" type="integer" value="1"/>\n+\t\t\t</when>\n+\t\t\t<when 
value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t<param name="ReportIntron" type="boolean" truevalue="-t" falsevalue="" checked="false" label="Also report queries which overlap with the introns of references, or queries such that a reference is in one of its intron"/>\n+\t\t<param name="InvertMatch" type="boolean" truevalue="-x" falsevalue="" checked="false" label="Invert match: the output file will contain all query elements which do NOT overlap"/>\n+\t\t<param name="NotOverlapping" type="boolean" truevalue="-O" falsevalue="" checked="false" label="Also report the query data which do not overlap, with the nbOverlaps tag set to 0."/>\n+\t</inputs>\n+\t\t\n+\t<outputs>\n+\t\t<data name="outputFileGff" format="gff3"/>\n+\t</outputs> \n+\t\n+\t<help>\n+This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example).\n+ \n+It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one.\n+\n+Various modifiers are also available:\n+\n+-Restrict query / reference set to the first nucleotide. Useful to check if the TSS of one set overlap with the other one.\n+\n+-Extend query / reference set on the 5\' / 3\' direction. 
Useful to check if one set is located upstream / downstream of the other one.\n+\n+-Include introns in the comparison.\n+\n+-Invert selection (report those which do not overlap).\n+\n+-Restrict to colinear / anti-sense overlapping data.\n+\n+-Keep the query data even if they do not strictly overlap with the reference data, but are located not further away than *n* nucleotides from some reference data.\n+\n+-Keep the query data which are strictly included in the reference data, meaning that a query transcript such that at least 1 nucleotide does not overlap with reference data will not be presented as a solution.\n+\n+The mechanism of shrinking and extending is also useful to make a fine grain comparison. For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. Now, if you want to keep those which are overlapping your data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcription factor sites which are upstream.\n+\n+One option reverses the selection. In other words, it performs the comparison as usual, and outputs all those query data which do not overlap.\n+\t</help>\n+</tool>\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/computeCoverage.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/computeCoverage.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,110 @@ +<tool id="ComputeCoverage" name="compute coverage"> + <description>Compute the coverage of a set with respect to another set.</description> + <command interpreter="python"> + ../Java/Python/ComputeCoverage.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + $ReportIntron + -o $outputFileGff + + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName1" type="select" label="Input File Format 1"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName1" format="bed" type="data" label="Input File 1"/> + </when> + <when value="gff"> + <param name="inputFileName1" format="gff" type="data" label="Input File 1"/> + </when> + <when value="gff2"> + <param name="inputFileName1" format="gff2" type="data" label="Input File 1"/> + </when> + <when value="gff3"> + <param name="inputFileName1" format="gff3" type="data" label="Input File 1"/> + </when> + <when value="sam"> + <param name="inputFileName1" format="sam" type="data" label="Input File 1"/> + 
</when> + <when value="gtf"> + <param name="inputFileName1" format="gtf" type="data" label="Input File 1"/> + </when> + </conditional> + + <conditional name="formatType2"> + <param name="FormatInputFileName2" type="select" label="Input File Format 2"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName2" format="bed" type="data" label="Input File 2"/> + </when> + <when value="gff"> + <param name="inputFileName2" format="gff" type="data" label="Input File 2"/> + </when> + <when value="gff2"> + <param name="inputFileName2" format="gff2" type="data" label="Input File 2"/> + </when> + <when value="gff3"> + <param name="inputFileName2" format="gff3" type="data" label="Input File 2"/> + </when> + <when value="sam"> + <param name="inputFileName2" format="sam" type="data" label="Input File 2"/> + </when> + <when value="gtf"> + <param name="inputFileName2" format="gtf" type="data" label="Input File 2"/> + </when> + </conditional> + + <param name="ReportIntron" type="boolean" truevalue="-t" falsevalue="" checked="false" label="Include introns."/> + + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3" label="[computeCoverage] OUTPUT file"/> + </outputs> + + <help> +This tool considers a query and a reference files, and gives the coverage of the query file by the reference. The output file is similar to the query file, where a tag **coverage** has been added. + </help> +</tool> + |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/coordinatesToSequence.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/coordinatesToSequence.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,65 @@ +<tool id="coordinatesToSequence" name="coordinates to sequence"> + <description>Coordinates to Sequences: Extract the sequences from a list of coordinates.</description> + <command interpreter="python"> + ../Java/Python/coordinatesToSequence.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -s $sequence + -o $outputFileFasta + + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName1" type="select" label="Input File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName1" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName1" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName1" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName1" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName1" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName1" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + <param name="sequence" type="data" label="Reference fasta File" format="fasta"/> + + </inputs> + + <outputs> + <data name="outputFileFasta" format="fasta" label="coordinates to sequences output"/> + </outputs> + + <help> +You can use this tool, if you just 
want to extract the sequences corresponding to a list of genomic coordinates, without any filtering. It requires a genomic coordinates file together with its format and the reference genome (FASTA file), and prints the corresponding sequences in a FASTA file. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getDifference.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getDifference.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,130 @@ +<tool id="getDifference" name="get difference"> + <description>Gets all the regions of the genome, except the one given in an annotation file. Alternatively, it may also give all the elements from the first set which does not ovelap with the second set (at the nucleotide level).</description> + <command interpreter="python"> + ../Java/Python/getDifference.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + $split + + #if $OptionSequence.option == "Yes": + -s $OptionSequence.sequence + #end if + + -o $outputFileGff + + + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName1" type="select" label="Input File Format 1"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName1" format="bed" type="data" label="Input File "/> + </when> + <when value="gff"> + <param name="inputFileName1" format="gff" type="data" label="Input File "/> + </when> + <when value="gff2"> + <param name="inputFileName1" format="gff2" type="data" label="Input File "/> + 
</when> + <when value="gff3"> + <param name="inputFileName1" format="gff3" type="data" label="Input File "/> + </when> + <when value="sam"> + <param name="inputFileName1" format="sam" type="data" label="Input File "/> + </when> + <when value="gtf"> + <param name="inputFileName1" format="gtf" type="data" label="Input File "/> + </when> + </conditional> + + <conditional name="formatType2"> + <param name="FormatInputFileName2" type="select" label="Input File Format 2"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName2" format="bed" type="data" label="reference file"/> + </when> + <when value="gff"> + <param name="inputFileName2" format="gff" type="data" label="reference file"/> + </when> + <when value="gff2"> + <param name="inputFileName2" format="gff2" type="data" label="reference file"/> + </when> + <when value="gff3"> + <param name="inputFileName2" format="gff3" type="data" label="reference file"/> + </when> + <when value="sam"> + <param name="inputFileName2" format="sam" type="data" label="reference file"/> + </when> + <when value="gtf"> + <param name="inputFileName2" format="gtf" type="data" label="reference file"/> + </when> + </conditional> + + <param name="split" type="boolean" truevalue="-p" falsevalue="" checked="false" label="When comparing to a set of genomic coordinates, do not join into exons."/> + + <conditional name="OptionSequence"> + <param name="option" type="select" label="Compare with a reference fasta file."> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="sequence" type="data" label="Fasta File" format="fasta"/> + </when> + <when value="No"> + </when> + </conditional> + + </inputs> + + + <outputs> + <data name="outputFileGff" format="gff3" 
label="[getDifference]output File."/> + </outputs> + + <help> +This tool has two different (but similar) uses. When given two sets of transcripts, it trims the elements of the first set so that they do not overlap with the second set. + +When only one set of transcripts is given, together with a reference genome, it produces a list of transcripts which complements the first set. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getDistance.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getDistance.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
b'@@ -0,0 +1,244 @@\n+<tool id="GetDistance" name="get distance">\n+\t<description>Give the distances between every data from the first input set with respect to the data from the second input set.</description>\n+\t<command interpreter="python">\n+\t\t../Java/Python/getDistance.py -i $formatType.inputFileName1\n+\t\t#if $formatType.FormatInputFileName1 == \'bed\':\n+\t\t\t-f bed\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff\':\n+\t\t\t-f gff\t\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff2\':\n+\t\t\t-f gff2\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff3\':\n+\t\t\t-f gff3\n+\t\t#elif $formatType.FormatInputFileName1 == \'sam\':\n+\t\t\t-f sam\n+\t\t#elif $formatType.FormatInputFileName1 == \'gtf\':\n+\t\t\t-f gtf\n+\t\t#end if\n+\t\t\t\n+\t\t-j $formatType2.inputFileName2\n+\t\t#if $formatType2.FormatInputFileName2 == \'bed\':\n+\t\t\t-g bed\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff\':\n+\t\t\t-g gff\t\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff2\':\n+\t\t\t-g gff2\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff3\':\n+\t\t\t-g gff3\n+\t\t#elif $formatType2.FormatInputFileName2 == \'sam\':\n+\t\t\t-g sam\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gtf\':\n+\t\t\t-g gtf\n+\t\t#end if\n+\n+\n+\t\t$absolute $proportion\n+\n+\t\t#if $OptionColinearOrAntiSens.OptionCA == "Colinear":\n+\t\t\t-c \n+\t\t#elif $OptionColinearOrAntiSens.OptionCA == \'AntiSens\':\n+\t\t\t-a\n+\t\t#end if\n+\n+\t\t#if $OptionMinDistance.MinD == "Yes":\n+\t\t\t-m $OptionMinDistance.minDistance\n+\t\t#end if\n+\n+\t\t#if $OptionMaxDistance.MaxD == "Yes":\n+\t\t\t-M $OptionMaxDistance.maxDistance\n+\t\t#end if\n+\n+\t\t$fivePrime $threePrime $spearMan\n+\n+\t\t#if $OptionBuckets.OBuckets == "Yes":\n+\t\t\t-u $OptionBuckets.buckets\n+\t\t#end if\n+\n+\t\t#if $OptionMinXaxis.MinX == "Yes":\n+\t\t\t-x $OptionMinXaxis.minXaxis\n+\t\t#end if\n+\n+\t\t#if $OptionMaxXaxis.MaxX == "Yes":\n+\t\t\t-X $OptionMaxXaxis.maxXaxis\n+\t\t#end 
if\n+\n+\t\t#if $OptionTitle.OTitle == "Yes":\n+\t\t\t-t $OptionTitle.title\n+\t\t#end if\n+\t\t\n+\t\t-o $outputFilePng\n+\t</command>\n+\n+\t<inputs>\n+\t\t<conditional name="formatType">\n+\t\t\t<param name="FormatInputFileName1" type="select" label="Input File Format 1">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName1" format="bed" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName1" format="gff" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName1" format="gff2" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName1" format="gff3" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName1" format="sam" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName1" format="gtf" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="formatType2">\n+\t\t\t<param name="FormatInputFileName2" type="select" label="Input File Format 2">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName2" format="bed" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName2" format="gff" type="data" 
label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName2" format="gff2" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName2" format="gff3" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName2" format="sam" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName2'..b' between two features">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="maxDistance" type="integer" value="1000"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<param name="fivePrime" type="boolean" truevalue="-5" falsevalue="" checked="false" label="five prime option" help="Consider the elements from input file 1 which are upstream of elements of input file 2"/>\n+\t\t<param name="threePrime" type="boolean" truevalue="-3" falsevalue="" checked="false" label="three prime option" help="Consider the elements from input file1 which are downstream of elements of input file 2"/>\n+\t\t<param name="spearMan" type="boolean" truevalue="-r" falsevalue="" checked="false" label="spearman option" help="Compute Spearman rho."/>\n+\n+\n+\t\t<conditional name="OptionBuckets">\n+\t\t\t<param name="OBuckets" type="select" label="Plots histogram instead of line plot with given interval size.">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="buckets" type="integer" value="1" label="Interval size"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="OptionMinXaxis">\n+\t\t\t<param name="MinX" type="select" label="Minimum value on the x-axis to 
plot.">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minXaxis" type="integer" value="1"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="OptionMaxXaxis">\n+\t\t\t<param name="MaxX" type="select" label="Maximum value on the x-axis to plot.">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="maxXaxis" type="integer" value="1"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="OptionTitle">\n+\t\t\t<param name="OTitle" type="select" label="Title for the graph.">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="title" type="text" value=""/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t</inputs>\n+\n+\n+\t<outputs>\n+\t\t<data name="outputFilePng" format="png"/>\n+\t</outputs> \n+\n+\t<help>\n+Give the distances between every data from the first input set and the data from the second input set. It outputs the size distribution. Each point (*x*, *y*) tells you that there exists *y* pairs of elements which are separated by *x* nucleotides.\n+\n+The general algorithm is the following. For each element of the first input set, it finds the closest element of the second set and computes the distance between the two elements. The distance is zero if the two elements overlap. This distance may not exist if the element of the first input set is alone on its chromosome (or contig).\n+\n+Actually, considering an element from the first input set, the algorithm will look at the vicinity of this element (1kb by default). 
You can increase the size of the vicinity using the appropriate option.\n+\n+As in *compare overlapping*, you can shrink or extend your sets of genomic coordinates, so that you can get the distance between starts of reads and starts of genes, for instance. You can also compute the distance from elements which are on the same strand only (which is not the case by default) or on the opposite strand only.\n+\n+You have several options for the output plot. You can first choose the region on the *x*-axis you want to plot. You can also display histograms instead of a line plot. In this case, the data are summed into buckets, whose sizes are given as an option. For instance, a bucket of size *s* at the point (*x*, *y*) means that there are *y* pairs of elements which are separated by *x* to *x + s* nucleotides.\n+\t</help>\n+\n+</tool>\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getDistribution.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getDistribution.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
<!--
	Galaxy wrapper for GetDistribution.py.
	Inputs : one annotation/mapping dataset (bed, gff, gff2, gff3, sam or gtf)
	         and a FASTA reference.
	Output : a PNG file (outputFile).
	Every "optionXxx" conditional below adds its flag to the command line only
	when the user answers "Yes".
-->
<tool id="getDistribution" name="get distribution">
	<description>Get Distribution: Get the distribution of the genomic coordinates along a genome.</description>
	<requirements>
		<requirement type="set_environment">PYTHONPATH</requirement>
	</requirements>
	<command interpreter="python">
		../Java/Python/GetDistribution.py -i $formatType.inputFileName
		## The select values are exactly the names expected after -f,
		## so the value is passed through instead of branching per format.
		-f $formatType.FormatInputFileName

		-r $refFile

		#if $optionNbBin.Nb == 'Yes':
			-b $optionNbBin.nbBins
		#end if

		#if $optionStart.start == 'Yes':
			-s $optionStart.startValue
		#end if

		#if $optionEnd.end == 'Yes':
			-e $optionEnd.endValue
		#end if

		#if $optionHeight.height == 'Yes':
			-H $optionHeight.heightValue
		#end if

		#if $optionWidth.width == 'Yes':
			-W $optionWidth.widthValue
		#end if

		#if $optionYMin.YMin == 'Yes':
			-y $optionYMin.YMinValue
		#end if

		#if $optionYMax.YMax == 'Yes':
			-Y $optionYMax.YMaxValue
		#end if

		#if $optionChrom.chrom == 'Yes':
			-c $optionChrom.chromValue
		#end if

		#if $optionColor.color == 'Yes':
			-l $optionColor.colorValue
		#end if

		$bothStrands
		$average
		$normalize
		-m
		-o $outputFile
	</command>

	<inputs>
		<!-- The chosen format restricts the dataset selector and is forwarded to -f. -->
		<conditional name="formatType">
			<param name="FormatInputFileName" type="select" label="Input File Format">
				<option value="bed">bed</option>
				<option value="gff">gff</option>
				<option value="gff2">gff2</option>
				<option value="gff3">gff3</option>
				<option value="sam">sam</option>
				<option value="gtf">gtf</option>
			</param>
			<when value="bed">
				<param name="inputFileName" format="bed" type="data" label="Input File"/>
			</when>
			<when value="gff">
				<param name="inputFileName" format="gff" type="data" label="Input File"/>
			</when>
			<when value="gff2">
				<param name="inputFileName" format="gff2" type="data" label="Input File"/>
			</when>
			<when value="gff3">
				<param name="inputFileName" format="gff3" type="data" label="Input File"/>
			</when>
			<when value="sam">
				<param name="inputFileName" format="sam" type="data" label="Input File"/>
			</when>
			<when value="gtf">
				<param name="inputFileName" format="gtf" type="data" label="Input File"/>
			</when>
		</conditional>

		<param name="refFile" format="fasta" type="data" label="reference genome file"/>

		<conditional name="optionNbBin">
			<param name="Nb" type="select" label="number of points">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="nbBins" type="integer" value="1000" />
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="optionChrom">
			<param name="chrom" type="select" label="if you wish to plot only one chromosome, mention the chromosome name">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="chromValue" type="text" value="chromName" />
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="optionStart">
			<param name="start" type="select" label="if you wish to plot only one locus, mention its start position">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="startValue" type="integer" value="0" />
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="optionEnd">
			<param name="end" type="select" label="if you wish to plot only one locus, mention its end position">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="endValue" type="integer" value="0" />
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="optionHeight">
			<param name="height" type="select" label="height of the figure">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="heightValue" type="integer" value="300" />
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="optionWidth">
			<param name="width" type="select" label="width of the figure">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="widthValue" type="integer" value="1000" />
			</when>
			<when value="No"/>
		</conditional>

		<!-- NOTE(review): the default below is identical to the y-axis maximum
		     default (1000); this looks like a copy/paste. Confirm the intended value. -->
		<conditional name="optionYMin">
			<param name="YMin" type="select" label="minimum value on the y-axis to plot">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="YMinValue" type="integer" value="1000" />
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="optionYMax">
			<param name="YMax" type="select" label="maximum value on the y-axis to plot">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="YMaxValue" type="integer" value="1000" />
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="optionColor">
			<param name="color" type="select" label="color of the lines (separated by commas and no space)">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="colorValue" type="text" value="red,blue" />
			</when>
			<when value="No"/>
		</conditional>

		<param name="bothStrands" type="boolean" truevalue="-2" falsevalue="" checked="false" label="plot one curve per strand"/>
		<param name="average" type="boolean" truevalue="-a" falsevalue="" checked="false" label="plot the number of element per bin (instead of sum)"/>

		<!-- NOTE(review): this conditional is never referenced in the command
		     template, so whatever is entered here currently has no effect.
		     Confirm which GetDistribution.py option it should feed before wiring it. -->
		<conditional name="optionNames">
			<param name="names" type="select" label="name for the tags (separated by commas and no space)">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="namesValue" type="text" value="nbElements" />
			</when>
			<when value="No"/>
		</conditional>

		<param name="normalize" type="boolean" truevalue="-z" falsevalue="" checked="false" label="normalize data (when panel sizes are different)"/>
	</inputs>

	<outputs>
		<data name="outputFile" format="png" label="[get distribution] output PNG file"/>
	</outputs>

	<help>
Print a density profile of the data for each chromosome. You have to provide the reference genome, to know the sizes of the chromosomes. You can also provide the number of points (called *bins*) you want per chromosome.

By default, only one curve is plotted per chromosome, but you can plot one curve per strand and per chromosome (the minus strand will be plotted with non-positive values on the *y*-axis).

If you want, you can also plot a specific region, by mentioning the chromosome, the start and the end positions of the region.
	</help>
</tool>
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getExons.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getExons.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,84 @@ +<tool id="getExons" name="get exons"> <!-- Galaxy wrapper: runs getExons.py on one annotation dataset and declares a GFF3 output (outputFileGff). --> + <description>Get the exons of a set of transcripts.</description> + <command interpreter="python"> + ../Java/Python/getExons.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + #if $optionSelect.Value == "Yes": + -s $optionSelect.selectValue + #end if + + -o $outputFileGff + </command> + + <inputs> <!-- formatType: the selected format restricts the dataset selector and decides which -f value the command emits. --> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + <conditional name="optionSelect"> <!-- -s is added to the command line only when "Yes" is chosen; the text value is forwarded as typed. --> + <param name="Value" type="select" label="select some of the exons (like '1,2,5..-3,-1')"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="selectValue" type="text" 
value="None" label="select option" help="like '1,2,5..-3,-1'"/> + </when> + <when value="No"> + </when> + </conditional> + </inputs> + + <outputs> + <data format="gff3" name="outputFileGff" label="[getExons -> gff3] Output File"/> + </outputs> +<tests> <!-- Functional test: GTF input (genes.gtf), exon selection disabled; the output is compared with exp_getExons.gff3. --> + <test> + <param name="FormatInputFileName" value="gtf" /> + <param name="inputFileName" value="genes.gtf" /> + <param name="Value" value="No"/> + <output name="outputFileGff" file="exp_getExons.gff3" /> + </test> +</tests> + + <help> +Provide all the exons of an annotation file. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getIntrons.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getIntrons.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,68 @@ +<tool id="getIntrons" name="get introns"> <!-- Galaxy wrapper: runs getIntrons.py on one annotation dataset and declares a GFF3 output (outputFileGff). --> + <description>Get the introns of a set of transcripts.</description> + <command interpreter="python"> + ../Java/Python/getIntrons.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + -o $outputFileGff + </command> + + <inputs> <!-- formatType: the selected format restricts the dataset selector and decides which -f value the command emits. --> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + </inputs> + + <outputs> + <data format="gff3" name="outputFileGff" label="[getIntrons -> gff3] Output File"/> + </outputs> +<tests> <!-- Functional test: GTF input (genes.gtf); the output is compared with exp_getIntrons.gff3. --> + <test> + <param name="FormatInputFileName" value="gtf" /> + <param name="inputFileName" value="genes.gtf" /> + <output name="outputFileGff" file="exp_getIntrons.gff3" /> + </test> + </tests> + + <help> +Provide all the introns 
of an annotation file. + </help> + +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getReadDistribution.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getReadDistribution.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
<!--
	Galaxy wrapper for WrappGetReadDistribution.py.
	Input  : one sequence dataset (fasta or fastq).
	Output : a tar archive (outputFile); Galaxy cannot preview it.
	The "keep the best n" and "keep the best n percentage" flags are added
	only when the matching conditional is set to "Yes".
-->
<tool id="getReadDistribution" name="get read distribution">
	<description>Get Read Distribution v1.0.1: Plot the number of identical reads and give the most represented.</description>
	<command interpreter="python">
		../Java/Python/WrappGetReadDistribution.py -i $formatType.inputFileName
		## The select value (fasta or fastq) is exactly what -f expects.
		-f $formatType.FormatInputFileName

		#if $optionnumber.number == 'Yes':
			-n $optionnumber.bestNumber
		#end if
		#if $optionpercent.percent == 'Yes':
			-p $optionpercent.percentage
		#end if
		-o $outputFile
	</command>

	<inputs>
		<conditional name="formatType">
			<param name="FormatInputFileName" type="select" label="Sequence input File Format ">
				<option value="fasta">fasta</option>
				<option value="fastq">fastq</option>
			</param>
			<when value="fasta">
				<param name="inputFileName" format="fasta" type="data" label="Sequence input File"/>
			</when>
			<when value="fastq">
				<param name="inputFileName" format="fastq" type="data" label="Sequence input File"/>
			</when>
		</conditional>

		<conditional name="optionnumber">
			<param name="number" type="select" label="keep the best n">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="bestNumber" type="integer" value="0" />
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="optionpercent">
			<param name="percent" type="select" label="keep the best n percentage">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="percentage" type="integer" value="0" />
			</when>
			<when value="No"/>
		</conditional>
	</inputs>

	<outputs>
		<data name="outputFile" format="tar" label="[getReadDistribution] tar out file" help="You can not see the results directly from galaxy, but you can download this tar output file."/>
	</outputs>

	<help>
This tool produces a .tar archive. The results cannot be displayed directly in Galaxy: download the archive to look at them.
	</help>
</tool>
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getSizes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getSizes.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
<!--
	Galaxy wrapper for getSizes.py.
	Input  : one dataset; the format select values already carry the "-f " prefix
	         and are pasted verbatim into the command line.
	Output : a PNG file (outputFile).
	Parameter names and option values are unchanged, so saved jobs and
	workflows keep working.
-->
<tool id="GetSizes" name="get sizes">
	<description>Get the sizes of a set of genomic coordinates.</description>
	<command interpreter="python">
		../Java/Python/getSizes.py -i $formatType.inputFileName $formatType.FormatInputFileName

		#if $OptionQuery.OptionQ == 'NONE':
			-q size
		#else:
			## The option values look like "-q intron size". Pasted as is, the shell
			## splits a multi-word query into several arguments. Drop the "-q " prefix
			## and quote the remainder so that it reaches the script as one argument.
			#set $query = str($OptionQuery.OptionQ)[3:]
			-q "$query"
		#end if

		-o $outputFile

		#if $OptionXMax.xMax == "Yes":
			-x $OptionXMax.maxValue
		#end if
		#if $OptionX.xLab == "Yes":
			-a $OptionX.xLabValue
		#end if
		#if $OptionY.yLab == "Yes":
			-b $OptionY.yLabValue
		#end if
		$barPlot
	</command>

	<inputs>
		<conditional name="formatType">
			<param name="FormatInputFileName" type="select" label="Input File Format">
				<option value="-f bed">bed</option>
				<option value="-f gff">gff</option>
				<option value="-f gff2">gff2</option>
				<option value="-f gff3">gff3</option>
				<option value="-f sam">sam</option>
				<option value="-f gtf">gtf</option>
				<option value="-f fasta">fasta</option>
				<option value="-f fastq">fastq</option>
			</param>
			<when value="-f bed">
				<param name="inputFileName" format="bed" type="data" label="Input File"/>
			</when>
			<when value="-f gff">
				<param name="inputFileName" format="gff" type="data" label="Input gff File"/>
			</when>
			<!-- NOTE(review): format is "gff" here, while the sibling wrappers in this
			     changeset use format="gff2" for the gff2 choice. Confirm which is intended. -->
			<when value="-f gff2">
				<param name="inputFileName" format="gff" type="data" label="Input gff2 File"/>
			</when>
			<when value="-f gff3">
				<param name="inputFileName" format="gff3" type="data" label="Input gff3 File"/>
			</when>
			<!-- The sam and gtf labels previously read "Input gff2 File" and
			     "Input gff3 File" (copy/paste from the two entries above). -->
			<when value="-f sam">
				<param name="inputFileName" format="sam" type="data" label="Input sam File"/>
			</when>
			<when value="-f gtf">
				<param name="inputFileName" format="gtf" type="data" label="Input gtf File"/>
			</when>
			<when value="-f fasta">
				<param name="inputFileName" format="fasta" type="data" label="Input fasta File"/>
			</when>
			<when value="-f fastq">
				<param name="inputFileName" format="fastq" type="data" label="Input fastq File"/>
			</when>
		</conditional>

		<!-- "NONE" (the default) makes the command fall back to "-q size". -->
		<conditional name="OptionQuery">
			<param name="OptionQ" type="select" label="measure type">
				<option value="-q size">size</option>
				<option value="-q intron size">intron size</option>
				<option value="-q exon size">exon size</option>
				<option value="-q 1st exon size">1st exon size</option>
				<option value="NONE" selected="true">NONE</option>
			</param>
			<when value="-q size"/>
			<when value="-q intron size"/>
			<when value="-q exon size"/>
			<when value="-q 1st exon size"/>
			<when value="NONE"/>
		</conditional>

		<conditional name="OptionXMax">
			<param name="xMax" type="select" label="maximum x-value to plot">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="maxValue" type="integer" value="1000"/>
			</when>
			<when value="No"/>
		</conditional>

		<!-- The axis titles are pasted unquoted into the command line, hence the
		     "no spaces" notice shown to the user. -->
		<conditional name="OptionX">
			<param name="xLab" type="select" label="X label title">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="xLabValue" type="text" value="Size" label="Notice: The title should not have spaces. EX. Size_of_transcript"/>
			</when>
			<when value="No"/>
		</conditional>

		<conditional name="OptionY">
			<param name="yLab" type="select" label="Y label title">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="yLabValue" type="text" value="#_reads" label="Notice: The title should not have spaces. EX. Number_of_reads"/>
			</when>
			<when value="No"/>
		</conditional>

		<param name="barPlot" type="boolean" truevalue="-B" falsevalue="" checked="false" label="use barplot representation"/>
	</inputs>

	<outputs>
		<data name="outputFile" format="png" label="[Get sizes] output file"/>
	</outputs>

	<help>
Get the sequence/annotation size distribution. A point (*x*, *y*) means that *y* elements have a size of *x* nucleotides.

When your mapping includes exon/intron structures, you can decide to count the size of the introns, the sizes of the exons or the size of the first exons.
	</help>
</tool>
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getWigData.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getWigData.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
<!--
	Galaxy wrapper for getWigData.py.
	Inputs : a GFF3 dataset, a WIG dataset and a tag name (forwarded with -t).
	Output : a GFF3 file (outputFile).
-->
<tool id="getWigData" name="get wig data">
	<description>Compute the average data for some genomic coordinates using WIG files</description>
	<command interpreter="python">
		## The strand checkbox already expands to "-s" when checked and to nothing
		## when unchecked. It used to be written with an extra leading dash, which
		## rendered "--s" when checked and a lone "-" when unchecked.
		../Java/Python/getWigData.py -i $inputGff3File -f gff3 -w $inputWigFile -t $tagName $strand -o $outputFile
	</command>

	<inputs>
		<param name="inputGff3File" type="data" label="Input Gff3 File" format="gff3"/>
		<param name="inputWigFile" type="data" label="Input Wig File" format="wig"/>
		<param name="tagName" type="text" value="None" label="tag option" help="choose a tag name to write the wig information to output file."/>
		<param name="strand" type="boolean" truevalue="-s" falsevalue="" checked="false" label="consider both strands separately."/>
	</inputs>

	<outputs>
		<data format="gff3" name="outputFile" label="[getWigData -> gff3] Output File"/>
	</outputs>

	<help>
Reads a transcript list, computes the average value of some WIG data (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) for each transcript and adds a tag corresponding to this average value to the transcript.

The script finds all the data which correspond to the genomic coordinates of a transcript, averages these data and stores the result into a tag. Then, the transcripts are written in an output file, together with the tag.

You can then plot your data using *plotTranscriptList.py*.
	</help>
</tool>
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getWigDistance.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getWigDistance.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
<!--
	Galaxy wrapper for getWigDistance.py.
	Inputs : a GFF3 dataset, a WIG dataset and a distance (forwarded with -d).
	Output : a PNG file (outputFile).
	"-a 0.0" is always passed; it is not exposed in the form.
-->
<tool id="getWigDistance" name="get wig distance">
	<description>Compute the average data around some genomic coordinates using WIG files (thus covering a large proportion of the genome).</description>
	<command interpreter="python">
		../Java/Python/getWigDistance.py -i $inputGff3File -f gff3 -w $inputWigFile -a 0.0 -d $distance $strand -o $outputFile
	</command>

	<inputs>
		<param name="inputGff3File" type="data" label="Input Gff3 File" format="gff3"/>
		<param name="inputWigFile" type="data" label="Input Wig File" format="wig"/>
		<param name="distance" type="integer" value="1000" label="Distance around positions."/>
		<!-- Expands to "-s" when checked and to nothing otherwise. -->
		<param name="strand" type="boolean" truevalue="-s" falsevalue="" checked="false" label="Consider both strands separately."/>
	</inputs>

	<outputs>
		<data name="outputFile" format="png" label="[getWigDistance] PNG output File"/>
	</outputs>

	<!-- NOTE(review): the last help sentence mentions a log scale for the y-axis,
	     but no parameter of this form controls it. Either expose the option or
	     drop the sentence. -->
	<help>
Plots the average data contained in a set of WIG files (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) around the first nucleotides of an annotation file.

The tool needs a transcript list, some WIG files, and a distance. For each transcript, it collects all the values around its first nucleotide, the radius being given by the distance. Then, it computes the average value for each position. A point (*x*, *y*) means that the average value in the WIG file for a nucleotide distant by *x* nucleotides from the first nucleotide of an input transcript is *y*.

You can possibly use a log scale for the *y*-axis.
	</help>
</tool>
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/getWigProfile.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/getWigProfile.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
<!--
	Galaxy wrapper for getWigProfile.py.
	Inputs : one annotation dataset (bed, gff, gff2 or gff3), one WIG dataset,
	         a number of points (-p) and a flanking distance (-d).
	Output : a PNG file (outputFilePNG).
	The smoothing flag (-m) is added only when "smoothen the curve" is "Yes".
-->
<tool id="getWigProfile" name="get wig profile">
	<description>Compute the average profile of some genomic coordinates using WIG files (thus covering a large proportion of the genome).</description>
	<command interpreter="python">
		../Java/Python/getWigProfile.py -i $formatType.inputFileName
		## The select values are exactly the names expected after -f.
		-f $formatType.FormatInputFileName
		-w $inputWigFile
		-p $nbPoints
		-d $distance
		$strands
		-o $outputFilePNG
		#if $optionSMO.SMO == 'Yes':
			-m $optionSMO.smoothen
		#end if
	</command>

	<inputs>
		<conditional name="formatType">
			<param name="FormatInputFileName" type="select" label="Input File Format">
				<option value="bed">bed</option>
				<option value="gff">gff</option>
				<option value="gff2">gff2</option>
				<option value="gff3">gff3</option>
			</param>
			<when value="bed">
				<param name="inputFileName" format="bed" type="data" label="Input File"/>
			</when>
			<when value="gff">
				<param name="inputFileName" format="gff" type="data" label="Input File"/>
			</when>
			<when value="gff2">
				<param name="inputFileName" format="gff2" type="data" label="Input File"/>
			</when>
			<when value="gff3">
				<param name="inputFileName" format="gff3" type="data" label="Input File"/>
			</when>
		</conditional>

		<param name="inputWigFile" type="data" label="Input Wig File" format="wig"/>
		<param name="nbPoints" type="integer" value="1000" label="number of points on the x-axis"/>
		<param name="distance" type="integer" value="0" label="distance around genomic coordinates"/>
		<!-- Expands to "-s" when checked and to nothing otherwise. -->
		<param name="strands" type="boolean" truevalue="-s" falsevalue="" checked="false" label="consider both strands separately"/>

		<conditional name="optionSMO">
			<param name="SMO" type="select" label="smoothen the curve">
				<option value="Yes">Yes</option>
				<option value="No" selected="true">No</option>
			</param>
			<when value="Yes">
				<param name="smoothen" type="integer" value="0"/>
			</when>
			<when value="No"/>
		</conditional>
	</inputs>

	<outputs>
		<data name="outputFilePNG" format="png" label="[getWigProfile] out file"/>
	</outputs>

	<!-- NOTE(review): the help speaks of a directory of WIG files, whereas this
	     form takes a single WIG dataset. Confirm and align the wording. -->
	<help>
Computes the average distribution of the WIG data (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) along the transcripts given in input, and possibly before and after the transcripts.

The main inputs of the functions are a file containing a list of transcripts (or any set of genomic intervals) and a directory containing a set of WIG files (one file per chromosome, or one file per chromosome and per strand). The function then computes the WIG profile of each transcript. The user can also define a region around the transcripts that should also be plotted (in this case, the profile will include the WIG values which overlap with the transcript as well as the 5' and 3' regions). Since the transcripts do not necessarily have the same sizes, all profiles will be extended or shrunk to fit in a size which is given by the user. If the resulting profile is a bit bumpy, the user can also smoothen the curve by using a linear smoothing function (the size of the smoothing window is given by the user). Finally, the user may want to plot the WIG data for the opposite strand too (if the strand specific WIG data are available).
	</help>
</tool>
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/mapperAnalyzer.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/mapperAnalyzer.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,202 @@ +<tool id="mapperAnalyzer" name="mapper analyzer"> + <description>Read the output of an aligner, print statistics and possibly translate into GFF, BED or GBrowse formats. </description> + <command interpreter="python"> + ../Java/Python/mapperAnalyzer.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'bam': + -f bam + #elif $formatType.FormatInputFileName1 == 'seqmap': + -f seqmap + #end if + + -q $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'fasta': + -k fasta + #elif $formatType2.FormatInputFileName2 == 'fastq': + -k fastq + #end if + + + #if $optionnumber.number == 'Yes': + -n $optionnumber.numberVal + #end if + #if $optionsize.size == 'Yes': + -s $optionsize.sizeVal + #end if + #if $optionidentity.identity == 'Yes': + -d $optionidentity.identityVal + #end if + #if $optionmismatch.mismatch == 'Yes': + -m $optionmismatch.mismatchVal + #end if + #if $optiongap.gap == 'Yes': + -p $optiongap.gapVal + #end if + #if $optiontitle.title == 'Yes': + -t $optiontitle.titleVal + #end if + #if $optionappend.append == 'Yes': + -a $optionappend.appendfile + #end if + + $merge + $remove + $remain + -o $outputFileGFF + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName1" type="select" label="Input File mapping Format"> + <option value="bed">bed</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="bam">bam</option> + <option value="seqmap" selected="true">seqmap</option> + </param> + <when value="bed"> + <param name="inputFileName1" format="bed" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName1" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName1" 
format="sam" type="data" label="Input File"/> + </when> + <when value="bam"> + <param name="inputFileName1" format="bam" type="data" label="Input File"/> + </when> + <when value="seqmap"> + <param name="inputFileName1" format="seqmap" type="data" label="Input File"/> + </when> + </conditional> + + <conditional name="formatType2"> + <param name="FormatInputFileName2" type="select" label="Reference sequence File Format"> + <option value="fasta" selected="true">fasta</option> + <option value="fastq">fastq</option> + </param> + <when value="fasta"> + <param name="inputFileName2" format="fasta" type="data" label="Reference sequence File Format"/> + </when> + <when value="fastq"> + <param name="inputFileName2" format="fastq" type="data" label="Reference sequence File Format"/> + </when> + </conditional> + + <conditional name="optionnumber"> + <param name="number" type="select" label="max. number of occurrences of a sequence"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="numberVal" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="optionsize"> + <param name="size" type="select" label="minimum pourcentage of size "> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="sizeVal" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="optionidentity"> + <param name="identity" type="select" label="minimum pourcentage of identity "> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="identityVal" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="optionmismatch"> + <param name="mismatch" type="select" label="maximum number of mismatches"> + <option value="Yes">Yes</option> + 
<option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="mismatchVal" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="optiongap"> + <param name="gap" type="select" label="maximum number of gaps"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="gapVal" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="optiontitle"> + <param name="title" type="select" label="title of the plots "> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="titleVal" type="text" value="title of the UCSC track" /> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="optionappend"> + <param name="append" type="select" label="append to GFF3 file"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="appendfile" type="data" format="gff3" label="append a file"/> + </when> + <when value="No"> + </when> + </conditional> + + <param name="merge" type="boolean" truevalue="-e" falsevalue="" checked="false" label="merge exons when introns are short "/> + <param name="remove" type="boolean" truevalue="-x" falsevalue="" checked="false" label="remove transcripts when exons are short"/> + <param name="remain" type="boolean" truevalue="-r" falsevalue="" checked="false" label="print the unmatched sequences "/> + </inputs> + + <outputs> + <data name="outputFileGFF" format="gff3" label="[mapperAnalyzer] out file"/> + </outputs> + + <help> +Maybe the first program you may use. It reads a set of mapping given by the tool you have used to map your data on the reference genome and translate it to a set of genomic coordinates. 
You also have the possibility to extract only those that you are interested in (few matches in the genome, few errors in the mapping, etc.). You can also select those reads which map fewer than a given number of times in the genome. Moreover, you can output the data in various different formats, which you can use to visualize them *via* UCSC genome browser or GBrowse. Unmatched reads can be written to another file, in case you would like to try to map them with another tool (may sometimes work!). + +You can filter your data according to: + +- number of errors in the mapping + +- number of occurrences of the mapping in the genome + +- size of the read mapped + +- number of gaps in the mapping + +The script needs an input file (your mapped reads) together with its format and the read sequences file together with its format (FASTA or FASTQ). If you want, you can also append the results of this script to another GFF3 file. This is useful when the GFF3 file is the result of the mapping using another tool. + +By default, any gap in the alignment to the reference sequence is treated like an exon. You can decide to remove this feature by merging short introns (actually, gaps). + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/mergeSlidingWindowsClusters.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/mergeSlidingWindowsClusters.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,107 @@ +<tool id="mergeSlidingWindowsClusters" name="merge sliding windows clusters"> + <description>Merges two files containing the results of a sliding windows clustering.</description> + <command interpreter="python"> + ../Java/Python/mergeSlidingWindowsClusters.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + -o $outputFileGff + + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName1" type="select" label="Input File Format 1"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName1" format="bed" type="data" label="Input File 1"/> + </when> + <when value="gff"> + <param name="inputFileName1" format="gff" type="data" label="Input File 1"/> + </when> + <when value="gff2"> + <param name="inputFileName1" format="gff2" type="data" label="Input File 1"/> + </when> + <when value="gff3"> + <param name="inputFileName1" format="gff3" type="data" label="Input File 1"/> + </when> + <when value="sam"> + <param name="inputFileName1" format="sam" 
type="data" label="Input File 1"/> + </when> + <when value="gtf"> + <param name="inputFileName1" format="gtf" type="data" label="Input File 1"/> + </when> + </conditional> + + <conditional name="formatType2"> + <param name="FormatInputFileName2" type="select" label="Input File Format 2"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName2" format="bed" type="data" label="Input File 2"/> + </when> + <when value="gff"> + <param name="inputFileName2" format="gff" type="data" label="Input File 2"/> + </when> + <when value="gff2"> + <param name="inputFileName2" format="gff2" type="data" label="Input File 2"/> + </when> + <when value="gff3"> + <param name="inputFileName2" format="gff3" type="data" label="Input File 2"/> + </when> + <when value="sam"> + <param name="inputFileName2" format="sam" type="data" label="Input File 2"/> + </when> + <when value="gtf"> + <param name="inputFileName2" format="gtf" type="data" label="Input File 2"/> + </when> + </conditional> + + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3"/> + </outputs> + + <help> +Sliding windows are also useful to compare two (or more!) sets of data. This can be very valuable when you want to compare differential expression in two different conditions. When you have two different sliding windows sets, this function merges them into one, where each window contains the two pieces of information. You may want to plot the data afterwards using the *plot transcript list* function. + </help> + +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/mergeTranscriptLists.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/mergeTranscriptLists.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,150 @@ +<tool id="mergeTranscriptLists" name="merge transcript lists"> + <description>Merge the elements of two lists of genomic coordinates.</description> + <command interpreter="python"> + ../Java/Python/mergeTranscriptLists.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + + $all + $normalize + + #if $OptionDistance.dis == 'Yes': + -d $OptionDistance.disVal + #end if + + #if $OptionColinearOrAntiSens.OptionCA == 'Colinear': + -c + #elif $OptionColinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + + -o $outputFileGff + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName1" type="select" label="Input File Format 1"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName1" format="bed" type="data" label="Input File 1"/> + </when> + <when value="gff"> + <param name="inputFileName1" format="gff" type="data" label="Input File 1"/> + </when> + <when value="gff2"> + <param name="inputFileName1" format="gff2" type="data" label="Input File 1"/> 
+ </when> + <when value="gff3"> + <param name="inputFileName1" format="gff3" type="data" label="Input File 1"/> + </when> + <when value="sam"> + <param name="inputFileName1" format="sam" type="data" label="Input File 1"/> + </when> + <when value="gtf"> + <param name="inputFileName1" format="gtf" type="data" label="Input File 1"/> + </when> + </conditional> + + <conditional name="formatType2"> + <param name="FormatInputFileName2" type="select" label="Input File Format 2"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName2" format="bed" type="data" label="Input File 2"/> + </when> + <when value="gff"> + <param name="inputFileName2" format="gff" type="data" label="Input File 2"/> + </when> + <when value="gff2"> + <param name="inputFileName2" format="gff2" type="data" label="Input File 2"/> + </when> + <when value="gff3"> + <param name="inputFileName2" format="gff3" type="data" label="Input File 2"/> + </when> + <when value="sam"> + <param name="inputFileName2" format="sam" type="data" label="Input File 2"/> + </when> + <when value="gtf"> + <param name="inputFileName2" format="gtf" type="data" label="Input File 2"/> + </when> + </conditional> + + + <param name="all" type="boolean" truevalue="-k" falsevalue="" checked="false" label="print all the transcripts, not only those overlapping"/> + <param name="normalize" type="boolean" truevalue="-n" falsevalue="" checked="false" label="normalize the number of reads per cluster by the number of mappings per read "/> + + <conditional name="OptionDistance"> + <param name="dis" type="select" label="provide the number of reads" > + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="disVal" type="integer" value="0" label="max. 
distance between two transcripts" /> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionColinearOrAntiSens"> + <param name="OptionCA" type="select" label="Colinear or anti-sens"> + <option value="Colinear">Colinear</option> + <option value="AntiSens">AntiSens</option> + <option value="NONE" selected="true">NONE</option> + </param> + <when value="Colinear"> + </when> + <when value="AntiSens"> + </when> + <when value="NONE"> + </when> + </conditional> + + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3" label="[mergeTranscriptLists]out file"/> + </outputs> + + <help> +The script is similar to *compare overlapping*, except that when data of two different sets overlap, they are merged. You can use the same parameters as *compare overlapping* and use them to look for transcription on both strands, for example. + +Optionally, you can also add to the output all the elements from the first set which do not overlap with the second set. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/modifyGenomicCoordinates.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/modifyGenomicCoordinates.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,135 @@ +<tool id="modifyGenomicCoordinates" name="modify genomic coordinates"> + <description>Extend or shrink a list of genomic coordinates.</description> + <command interpreter="python"> ../Java/Python/modifyGenomicCoordinates.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + #if $OptionStart.start == "Yes": + -s $OptionStart.startValue + #end if + + #if $OptionEnd.end == "Yes": + -e $OptionEnd.endValue + #end if + + #if $OptionFivePrim.five == "Yes": + -5 $OptionFivePrim.fivePValue + #end if + + #if $OptionTroisP.TroisP == "Yes": + -3 $OptionTroisP.ThreePValue + #end if + + -o $outputFile + </command> + + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + 
<conditional name="OptionStart"> + <param name="start" type="select" label="shrink to the start of the feature"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="startValue" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionEnd"> + <param name="end" type="select" label="shrink to the end of the feature"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="endValue" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + + <conditional name="OptionFivePrim"> + <param name="five" type="select" label="extend to the 5' direction"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="fivePValue" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionTroisP"> + <param name="TroisP" type="select" label="extend to the 3' direction"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="ThreePValue" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + + </inputs> + + <outputs> + <data format="gff3" name="outputFile" label="[modifyGenomicCoordinates] Output File"/> + </outputs> + + <help> +This tool reads a list of transcripts and modifies each feature by: + +- shrinking it to the *n* first nucleotides or the *n* last nucleotides, or + +- extending it to *n* nucleotides towards the 5' direction (upstream) or the 3' direction (downstream). + +Note that the 5' or 3' direction depends on the orientation of the feature (the 5' end of a transcript located on the minus strand is on the right hand of this transcript!). 
+ +The tool needs a transcript file, its format, and outputs a new transcript file. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/modifySequenceList.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/modifySequenceList.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,49 @@ +<tool id="modifySequenceList" name="modify sequence list"> + <description>Extend or shrink a list of sequences. </description> + <command interpreter="python"> ../Java/Python/modifySequenceList.py -i $inputFile -f fasta + #if $OptionStart.Start == "Yes": + -s $OptionStart.StartVal + #end if + #if $OptionEnd.End == "Yes": + -e $OptionEnd.EndVal + #end if + -o $outputFile + </command> + + + <inputs> + <param name="inputFile" type="data" format="fasta" label="input file"/> + + <conditional name="OptionStart"> + <param name="Start" type="select" label="keep first nucleotides"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="StartVal" type="integer" value="0" /> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionEnd"> + <param name="End" type="select" label="keep last nucleotides"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="EndVal" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + </inputs> + + <outputs> + <data format="fasta" name="outputFile" label="[modifySequenceList] Output File"/> + </outputs> + + <help> + This tool reads a list of sequences (in multi-FASTA/Q format) that you provide and shrinks each sequence to the *n* first nucleotides or the *n* last nucleotides. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/plotCoverage.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/plotCoverage.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,271 @@\n+<tool id="plotCoverage" name="plot coverage">\n+\t<description>Plot the coverage of the first data with respect to the second one.</description>\n+\t<command interpreter="python">\n+\t\t../Java/Python/WrappPlotCoverage.py -i $formatType.inputFileName1\n+\t\t#if $formatType.FormatInputFileName1 == \'bed\':\n+\t\t\t-f bed\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff\':\n+\t\t\t-f gff\t\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff2\':\n+\t\t\t-f gff2\n+\t\t#elif $formatType.FormatInputFileName1 == \'gff3\':\n+\t\t\t-f gff3\n+\t\t#elif $formatType.FormatInputFileName1 == \'sam\':\n+\t\t\t-f sam\n+\t\t#elif $formatType.FormatInputFileName1 == \'gtf\':\n+\t\t\t-f gtf\n+\t\t#end if\n+\t\t\t\n+\t\t-j $formatType2.inputFileName2\n+\t\t#if $formatType2.FormatInputFileName2 == \'bed\':\n+\t\t\t-g bed\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff\':\n+\t\t\t-g gff\t\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff2\':\n+\t\t\t-g gff2\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gff3\':\n+\t\t\t-g gff3\n+\t\t#elif $formatType2.FormatInputFileName2 == \'sam\':\n+\t\t\t-g sam\n+\t\t#elif $formatType2.FormatInputFileName2 == \'gtf\':\n+\t\t\t-g gtf\n+\t\t#end if\n+\n+\n+\t\t#if $optionRef.Ref == \'Yes\':\n+\t\t\t-q $optionRef.inputSequenceFile\n+\t\t#end if\n+\n+\t\t#if $optionwidth.width == \'Yes\':\n+\t\t\t-w $optionwidth.widthVal\n+\t\t#end if\n+\t\t#if $optionheight.height == \'Yes\':\n+\t\t\t-e $optionheight.heightVal\n+\t\t#end if\n+\t\t#if $optionXlab.Xlab == \'Yes\':\n+\t\t\t-x $optionXlab.XlabVal\n+\t\t#end if\n+\t\t#if $optionYlab.Ylab == \'Yes\':\n+\t\t\t-y $optionYlab.YlabVal\n+\t\t#end if\n+\t\t#if $optiontitle.title == \'Yes\':\n+\t\t\t-t $optiontitle.titleVal\n+\t\t#end if\t\n+\t\n+\t\t#if $optionplusColor.plusColor == \'Yes\':\n+\t\t\t-p $optionplusColor.plusColorVal\n+\t\t#end if\n+\t\t#if $optionminusColor.minusColor == \'Yes\':\n+\t\t\t-m $optionminusColor.minusColorVal\n+\t\t#end if\n+\n+\t\t#if 
$optionsumColor.sumColor == \'Yes\':\n+\t\t\t-s $optionsumColor.sumColorVal\n+\t\t#end if\n+\t\t#if $optionlineColor.lineColor == \'Yes\':\n+\t\t\t-l $optionlineColor.lineColorVal\n+\t\t#end if\t\n+\n+\t\t$merge\n+\t\t-o $outputFile\n+\t</command>\n+\n+\t<inputs>\n+\t\t<conditional name="formatType">\n+\t\t\t<param name="FormatInputFileName1" type="select" label="Input File Format 1">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName1" format="bed" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName1" format="gff" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName1" format="gff2" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName1" format="gff3" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName1" format="sam" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t\t<when value="gtf">\n+\t\t\t\t<param name="inputFileName1" format="gtf" type="data" label="Input File 1"/>\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="formatType2">\n+\t\t\t<param name="FormatInputFileName2" type="select" label="Input File Format 2">\n+\t\t\t\t<option value="bed">bed</option>\n+\t\t\t\t<option value="gff">gff</option>\n+\t\t\t\t<option value="gff2">gff2</option>\n+\t\t\t\t<option value="gff3">gff3</option>\n+\t\t\t\t<option value="sam">sam</option>\n+\t\t\t\t<option value="gtf">gtf</option>\n+\t\t\t</param>\n+\t\t\t<when value="bed">\n+\t\t\t\t<param name="inputFileName2" format="bed" type="data" label="Input File 
2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff">\n+\t\t\t\t<param name="inputFileName2" format="gff" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff2">\n+\t\t\t\t<param name="inputFileName2" format="gff2" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="gff3">\n+\t\t\t\t<param name="inputFileName2" format="gff3" type="data" label="Input File 2"/>\n+\t\t\t</when>\n+\t\t\t<when value="sam">\n+\t\t\t\t<param name="inputFileName2" format="sam" type="data" label="In'..b'\t<when value="Yes">\n+\t\t\t\t<param name="heightVal" type="integer" value="1000" />\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="optiontitle">\n+\t\t\t<param name="title" type="select" label="title for the figure">\n+\t\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="titleVal" type="text" value=" " />\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t\n+\t\t<conditional name="optionXlab">\n+\t\t\t<param name="Xlab" type="select" label="label on the x-axis">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="XlabVal" type="text" value=" "/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="optionYlab">\n+\t\t\t<param name="Ylab" type="select" label="label on the y-axis">\n+\t\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="YlabVal" type="text" value=" " />\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="optionplusColor">\n+\t\t\t<param name="plusColor" type="select" label="color for the elements on 
the plus strand">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="plusColorVal" type="text" value="red"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="optionminusColor">\n+\t\t\t<param name="minusColor" type="select" label="color for the elements on the minus strand">\n+\t\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="minusColorVal" type="text" value="blue"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="optionsumColor">\n+\t\t\t<param name="sumColor" type="select" label="color for 2 strands coverage line">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="sumColorVal" type="text" value="black"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\n+\t\t<conditional name="optionlineColor">\n+\t\t\t<param name="lineColor" type="select" label="color for the lines">\n+\t\t\t\t<option value="Yes">Yes</option>\n+\t\t\t\t<option value="No" selected="true">No</option>\n+\t\t\t</param>\n+\t\t\t<when value="Yes">\n+\t\t\t\t<param name="lineColorVal" type="text" value="black"/>\n+\t\t\t</when>\n+\t\t\t<when value="No">\n+\t\t\t</when>\n+\t\t</conditional>\n+\t\t\n+\t\t<param name="merge" type="boolean" truevalue="-1" falsevalue="" checked="false" label="merge the 2 plots in 1"/>\n+\t</inputs>\n+\n+\t<outputs>\n+\t\t<data name="outputFile" format="tar" label="[plot coverage] tar output file" help="You can not see the results directly from galaxy, but you can download this tar output file."/>\n+\t</outputs> \n+\t\n+ <help>\n+Plot the coverage of the first set of genomic 
coordinates with respect to the second set of genomic coordinates. For each element of the second set (we will suppose that they are annotated genes), it computes the number of elements of the first set (reads, for instance) which overlap it.\n+\n+Alternatively, if the first file is in GFF format, and contains the **Target** file, you can omit the second file. However, a fasta file corresponding to the second file should be given (to compute the size of the reference elements).\n+\n+The tool produces two plots per gene. The first plot gives the coverage: a point (*x*, *y*) means that *y* reads cover the *x* th nucleotide of the gene. The second figure displays the (possibly spliced) gene in black, and the overlapping reads (blue is colinear, red is anti-sense).\n+\n+This script gives a .tar out file; if you want to take a look at the results, you have to download it.\n+ </help>\t\t\n+</tool>\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/plotTranscriptList.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/plotTranscriptList.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,132 @@ +<tool id="plotTranscriptList" name="plot transcript list"> + <description>Plot some information from a list of transcripts. </description> + <command interpreter="python"> + ../Java/Python/plotTranscriptList.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + -x $xVal + -y $yVal + #if $optionz.z == 'Yes': + -z $optionz.zVal + #end if + + -X $XVal + -Y $YVal + -Z $ZVal + + #if $optionxLab.xLab == 'Yes': + -n $optionxLab.labVal + #end if + #if $optionyLab.yLab == 'Yes': + -m $optionyLab.labVal + #end if + + $log + -s $shape + -b $bucket + + -o $outputFilePNG + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="gtf">gtf</option> + </param> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + <param name="xVal" type="text" value="None" label="tag for the x value"/> + <param name="yVal" type="text" value="None" label="tag for the y value"/> + + <conditional name="optionz"> + <param name="z" type="select" label="tag for the z value "> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="zVal" type="text" value="None"/> + </when> + <when value="No"> + 
</when> + </conditional> + + <param name="XVal" type="float" value="0.0" label="value for x when tag is not present "/> + + <param name="YVal" type="float" value="0.0" label="value for y when tag is not present"/> + + <param name="ZVal" type="float" value="0.0" label="value for z when tag is not present (if applicable)"/> + + <conditional name="optionxLab"> + <param name="xLab" type="select" label="label on the x-axis "> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="labVal" type="text" value=" "/> + </when> + <when value="No"> + </when> + </conditional> + <conditional name="optionyLab"> + <param name="yLab" type="select" label="label on the y-axis "> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="labVal" type="text" value=" "/> + </when> + <when value="No"> + </when> + </conditional> + + <param name="log" type="select" label="use log on x- or y-axis (write 'x', 'y' or 'xy')"> + <option value="" selected="true">No</option> + <option value="-l x">log on the x-axis</option> + <option value="-l y">log on the y-axis</option> + <option value="-l xy">log on the x- and y-axis</option> + </param> + + <param name="shape" type="text" value="barplot" label="shape of the plot [format: choice (barplot, line, points, heatPoints)]"/> + <param name="bucket" type="float" value="1.0" label="bucket size (for the line plot)"/> + + </inputs> + + <outputs> + <data name="outputFilePNG" format="png" label="[plot transcript list] output file"/> + </outputs> + + <help> +Plot the data attached as tags in a transcript list. This can be used for displaying the comparison of different sets of sliding windows, for instance. + +The tool reads the tags of a transcript file (actually, a GFF3 file). It considers more specifically the tag names that you specify as parameter. If you use only one tag name, you can display a line plot. 
 In this case, you have to specify a bucket size *s* (which is by default 1) and a point (*x*, *y*) tells you that there are *y* transcripts with tag values *x* to *x + s*. + +You can display cloud plots if you use two tag names. Each point represents the values of the two tags of a transcript. If you use three variables, the third variable will be the color of the point. You can also use a log scale and name the axes of the plot. + +Each transcript must contain the tags which are specified. If not, you should provide a default value, which is used when the tag is not present. + +If you use a cloud plot, you can compute the Spearman's rho to quantify a correlation between your two tag values. + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/removeExonLines.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/removeExonLines.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,15 @@ +<tool id="removeExonLines" name="remove exon lines"> + <description>Removes the lines containing Exon.</description> + <command interpreter="sh"> ../Java/Python/removeExonLines.sh $inputFile > $outputFile </command> + <inputs> + <param name="inputFile" type="data" label="Input File" format="gff3"/> + </inputs> + + <outputs> + <data format="gff3" name="outputFile" label="[removeExonLine] Output File"/> + </outputs> + + <help> + command example: sh removeExonLines.sh input.gff3 + </help> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/restrictFromSize.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/restrictFromSize.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,99 @@ +<tool id="restrictFromSize" name="restrict from size"> + <description>Select the elements of a list of sequences or transcripts with a given size.</description> + <command interpreter="python"> + ../Java/Python/restrictFromSize.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'fasta': + -f fasta + #elif $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + #if $OptionMax.maximum == "Yes": + -M $OptionMax.max + #end if + #if $OptionMin.minimum == "Yes": + -m $OptionMin.min + #end if + + -o $outputFileGff + </command> + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="fasta">fasta</option> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="fasta"> + <param name="inputFileName" format="fasta" type="data" label="Input File"/> + </when> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" 
label="Input File"/> + </when> + </conditional> + + <conditional name="OptionMax"> + <param name="maximum" type="select" label="maximum number of np"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="max" type="integer" value="1" help="Be Careful! The value must be upper than 0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionMin"> + <param name="minimum" type="select" label="minimum number of np"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="min" type="integer" value="1" help="Be Careful! The value must be upper than 0"/> + </when> + <when value="No"> + </when> + </conditional> + + </inputs> + + <outputs> + <data name="outputFileGff" format="gff3" label="[restrictFromSize] Output File"/> + </outputs> + + <help> +Reads a list of sequences or genomic coordinates and outputs those which are longer and / or shorter than a given size ---which you provide. + </help> + +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/restrictTranscriptList.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/restrictTranscriptList.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,121 @@ +<tool id="restrictTranscriptList" name="restrict transcript list"> + <description>Select the features which are located in a given locus.</description> + <command interpreter="python"> ../Java/Python/restrictTranscriptList.py -i $formatType.inputFileName + #if $formatType.FormatInputFileName == 'bed': + -f bed + #elif $formatType.FormatInputFileName == 'gff': + -f gff + #elif $formatType.FormatInputFileName == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName == 'sam': + -f sam + #elif $formatType.FormatInputFileName == 'gtf': + -f gtf + #end if + + #if $OptionChrom.Chrom == "Yes": + -c $OptionChrom.ChromName + #end if + + #if $OptionStart.start == "Yes": + -s $OptionStart.startValue + #end if + + #if $OptionEnd.end == "Yes": + -e $OptionEnd.endValue + #end if + + -o $outputFile + + </command> + + + <inputs> + <conditional name="formatType"> + <param name="FormatInputFileName" type="select" label="Input File Format"> + <option value="bed">bed</option> + <option value="gff">gff</option> + <option value="gff2">gff2</option> + <option value="gff3">gff3</option> + <option value="sam">sam</option> + <option value="gtf">gtf</option> + </param> + <when value="bed"> + <param name="inputFileName" format="bed" type="data" label="Input File"/> + </when> + <when value="gff"> + <param name="inputFileName" format="gff" type="data" label="Input File"/> + </when> + <when value="gff2"> + <param name="inputFileName" format="gff2" type="data" label="Input File"/> + </when> + <when value="gff3"> + <param name="inputFileName" format="gff3" type="data" label="Input File"/> + </when> + <when value="sam"> + <param name="inputFileName" format="sam" type="data" label="Input File"/> + </when> + <when value="gtf"> + <param name="inputFileName" format="gtf" type="data" label="Input File"/> + </when> + </conditional> + + <conditional name="OptionChrom"> + <param name="Chrom" type="select" label="chromosome 
name"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="ChromName" type="text" value="None"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionStart"> + <param name="start" type="select" label="start region of the locus"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="startValue" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionEnd"> + <param name="end" type="select" label="end region of the locus"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="endValue" type="integer" value="0"/> + </when> + <when value="No"> + </when> + </conditional> + </inputs> + + <outputs> + <data format="gff3" name="outputFile" label="[restrictTranscriptList] Output File"/> + </outputs> + + <help> +Reads a list of genomic coordinates and outputs those which are on a given chromosome and / or between two given positions. + </help> +<tests> + <test> + <param name="FormatInputFileName" value="gtf" /> + <param name="inputFileName" value="genes.gtf" /> + <param name="Chrom" value="Yes"/> + <param name="ChromName" value="I"/> + <param name="start" value="No" /> +<param name="end" value="No" /> + <output name="outputFile" file="exp_restrictTranscriptList.gff3" /> + </test> + </tests> + +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c SMART/galaxy/trimSequences.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/trimSequences.xml Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,79 @@ +<tool id="trimSequences" name="trim sequences"> + <description>Remove the 5' and/or 3' adapters of a list of reads.</description> + <command interpreter="python"> ../Java/Python/trimSequences.py -i $inputFile -f fastq + #if $OptionFPADP.FPADP == "Yes": + -5 $OptionFPADP.fivePAdaptor + #end if + #if $OptionTPADP.TPADP == "Yes": + -3 $OptionTPADP.threePAdaptor + #end if + -e $errors + $indels + $noAdaptor5p $noAdaptorFile5p + $noAdaptor3p $noAdaptorFile3p + -o $outputFile + + </command> + + + <inputs> + <param name="inputFile" type="data" label="Input fastq File" format="fastq"/> + + <conditional name="OptionFPADP"> + <param name="FPADP" type="select" label="5' adapter"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="fivePAdaptor" type="text" value="None" /> + </when> + <when value="No"> + </when> + </conditional> + + <conditional name="OptionTPADP"> + <param name="TPADP" type="select" label="3' adapter"> + <option value="Yes">Yes</option> + <option value="No" selected="true">No</option> + </param> + <when value="Yes"> + <param name="threePAdaptor" type="text" value="None" /> + </when> + <when value="No"> + </when> + </conditional> + + <param name="errors" type="integer" label="number of errors in percent" value="0" /> + <param name="indels" type="boolean" truevalue="-d" falsevalue="" checked="false" label="indels option" help="also accept indels"/> + <param name="noAdaptor5p" type="boolean" truevalue="-n" falsevalue="" checked="false" label="noAdaptor 5' option" help="file name where to print sequences with no 5' adapter "/> + <param name="noAdaptor3p" type="boolean" truevalue="-m" falsevalue="" checked="false" label="noAdaptor 3' option" help="file name where to print sequences with no 3' adapter "/> + + </inputs> + + <outputs> + <data format="fastq" name="outputFile" label="[trim sequences] output file"/> + <data name="noAdaptorFile5p" format="fastq" label="[trim 
sequences] noAdapter5p file"> + <filter>noAdaptor5p</filter> + </data> + <data name="noAdaptorFile3p" format="fastq" label="[trim sequences] noAdapter3p file"> + <filter>noAdaptor3p</filter> + </data> + </outputs> + + <help> +This function removes the adaptor from the 5' or 3' end of your reads. It can even recognize the adaptors which are partially present. You can specify whether you are ready to accept indels or not. + </help> + <tests> + <test> + <param name="inputFile" value="short_fastq.fastq" /> + <param name="FPADP" value="Yes"/> + <param name="fivePAdaptor" value="AAAA" /> + <param name="TPADP" value="No"/> + <param name="errors" value="1"/> + <param name="indels" value="False"/> + <param name="noAdaptor5p" value="False"/> + <param name= "noAdaptor3p" value="False"/> + <output name="outputFile" file="exp_trimsequences_short_fastq.fastq" /> + </test> + </tests> +</tool> |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/LoggerFactory.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/LoggerFactory.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,139 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +## @mainpage Documentation of the REPET API +# +# Welcome to the API documentation! +# This API is a set of packages and classes for pipeline(s) development. +# +# @par The "logger" package +# +# Logging is managed via LoggerFactory. This class creates instances of logging.logging python class. It's strongly encouraged to use this factory each time you need to log something. 
+# +# @par The "checker" package +# +# This package is a set of classes designed to facilitate development of different kind of checks: filesystem checks, environment checks, configuration file checks ... +# +# Classes should subclass checker::IChecker or if a logger is needed: checker::AbstractChecker. +# +# Methods should raise checker::CheckerException. +# +# Use checker::ConfigChecker and checker::ConfigException for configuration files checks. +# +# checker::CheckerUtils is a set of small static methods shared by other classes of checker package. +# +# @par The "coord" package +# +# This package is a set of classes dedicated to coordinates manipulations. +# +# A coord::Range instance records a region on a given sequence (start, end and sequence name). +# +# A coord::Map instance is a coord::Range instance and record a named region on a given sequence (start, end, sequence name and name). +# +# A coord::Set instance is a coord::Map instance and record a named region on a given sequence with an identifier (start, end, sequence name, name and id). +# +# A coord::Align instance handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity). +# +# A coord::Path instance is a coord::Align instance and handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity) with an identifier. +# +# A coord::Match instance is a coord::Path instance and handle a chain of match(es) between two sequences, query and subject, with an identifier and the length of the input sequences. +# +# coord::Align, coord::Map, coord::Path and coord::Set come with utils classes: coord::AlignUtils, coord::MapUtils, coord::PathUtils and coord::SetUtils. +# +# @par The "seq" package +# +# This package a set of classes dedicated to sequences manipulations. +# +# A seq::Bioseq instance records a sequence with its header. seq::Bioseq comes with an utils class: seq::BioseqUtils. 
+# +# A seq::BioseqDB instance handle a collection of a Bioseq (header-sequence). +# +# A seq::AlignedBioseqDB instance is a multiple sequence alignment representation. +# +# A seq::FastaUtils is a set of static methods for fasta file manipulation. +# +# @par The "sql" package +# +# This package is dedicated to persistance of coord package objects. +# All classes come with dedicated interfaces. Use these interfaces for class manipulation. +# Class names patterns are ITable*Adaptator and Table*Adaptator. +# +# sql::ITablePathAdaptator, sql::TablePathAdaptator / +# sql::ITableSetAdaptator, sql::TableSetAdaptator / +# sql::ITableSeqAdaptator, sql::TableSeqAdaptator / +# sql::ITableMapAdaptator, sql::TableMapAdaptator / +# sql::ITableMatchAdaptator, sql::TableMatchAdaptator. +# + +import logging +import sys + +DEFAULT_LEVEL = 1 +DEFAULT_FORMAT = "%(asctime)s - %(module)s - %(levelname)s - %(message)s" +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" + +## Use this class to create a instance of logging class. +# +class LoggerFactory(object): + + def createLogger(name, verbosity = DEFAULT_LEVEL, format = DEFAULT_FORMAT, out = sys.stdout): + log = logging.getLogger(name) + + hasStreamHandler = False + for handler in log.handlers: + if handler.__class__ == logging.StreamHandler: + hasStreamHandler = True + break + if not hasStreamHandler: + formatter = logging.Formatter(format, DATE_FORMAT) + handler = logging.StreamHandler(out) + handler.setFormatter(formatter) + log.addHandler(handler) + + LoggerFactory.setLevel(log, verbosity) + return log + + createLogger = staticmethod(createLogger) + + def setLevel(log, verbosity): + log.disabled = False + if verbosity >= 4: + log.setLevel(logging.DEBUG) + elif verbosity == 3: + log.setLevel(logging.INFO) + elif verbosity == 2: + log.setLevel(logging.WARNING) + elif verbosity == 1: + log.setLevel(logging.ERROR) + elif verbosity == 0: + log.disabled = True + + setLevel = staticmethod(setLevel) |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/AbstractChecker.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/AbstractChecker.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,61 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +from commons.core.checker.IChecker import IChecker +from commons.core.LoggerFactory import LoggerFactory + + +## Enable a Logger in your Checker. +# +# Subclasses of AbstractChecker have a already a logger enabled (referenced by self._log attribute). Subclasses also already implements IChecker. +# All you have to do is to call __init__() method in your own constructor. 
+class AbstractChecker( IChecker ): + + ## Constructor + # + # @param logFileName name of log file where logger outputs + # + def __init__(self, logFileName): + self._log = LoggerFactory.createLogger(logFileName) + + + ## Set (change) default logger + # + # @param logger a new logger + # + def setLogger(self, logger): + self._log = logger + + + ## Return the logger instance + # + def getLogger(self): + return self._log |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/CheckerException.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/CheckerException.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,52 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +## Exception raised during check +# +# This class wraps Exception class +# +class CheckerException( Exception ): + + ## Constructor + # + # @param msg message embedded in Exception class + def __init__(self,msg=""): + self.messages = [] + self.msg = msg + Exception.__init__(self, msg) + + + def setMessages(self,lMessages): + self.messages = lMessages + + + def getMessages(self): + return self.messages |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/CheckerUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/CheckerUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,316 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import os\n+import sys\n+import re\n+import glob\n+import ConfigParser\n+from ConfigParser import NoOptionError\n+from ConfigParser import NoSectionError\n+from commons.core.checker.CheckerException import CheckerException\n+\n+\n+## A set of static methods used to perform checks.\n+#\n+#\n+class CheckerUtils( object ):\n+ \n+ ## Check if blastName param is in ["blastn", "blastp", "blastx", "tblastn", "tblastx"]\n+ # \n+ # @param blastName name to check\n+ # @return True if name is in list False otherwise\n+ #\n+ def isBlastNameNotInBlastValues( blastName ):\n+ blastValuesSet = set( ["blastn", "blastp", "blastx", "tblastn", "tblastx"] )\n+ blastNameSet = set( [ blastName ] )\n+ return not blastNameSet.issubset( blastValuesSet )\n+ \n+ isBlastNameNotInBlastValues = staticmethod( isBlastNameNotInBlastValues )\n+ \n+ \n+ ## Check if param is NOT "TRUE" and NOT false "FALSE"\n+ #\n+ # @param param str to check\n+ # @return True if param is not eq to "TRUE" AND not eq to "FALSE", false otherwise \n+ #\n+ def isNotTRUEisNotFALSE( param ):\n+ return param != "TRUE" and param != "FALSE"\n+ \n+ isNotTRUEisNotFALSE = staticmethod( isNotTRUEisNotFALSE )\n+ \n+ \n+ ## Check if resource (file or dir) do NOT exists\n+ # \n+ # @param resource file or dir to check\n+ # @return True if resource exists False otherwise\n+ #\n+ def isRessourceNotExits( resource ):\n+ return not os.path.exists( resource )\n+ \n+ isRessourceNotExits = staticmethod( isRessourceNotExits )\n+ \n+ \n+ ## Check a specific E-value format: de-dd \n+ #\n+ # @param param E-value to check\n+ # @return True if format is de-dd False otherwise\n+ #\n+ def isNotAeValueWithOneDigit2DecimalsAtLeast( param ):\n+ # \\d\\d stands for 2 digits and more ???\n+ return not re.match( "\\de\\-\\d\\d", param )\n+ \n+ isNotAeValueWithOneDigit2DecimalsAtLeast = staticmethod( 
isNotAeValueWithOneDigit2DecimalsAtLeast )\n+ \n+ \n+ ## Check a number format\n+ #\n+ # @param param value to check\n+ # @return True if param is a number (d+) False otherwise\n+ #\n+ def isNotANumber( param ):\n+ return not re.match( "\\d+", param )\n+ \n+ isNotANumber = staticmethod( isNotANumber )\n+ \n+\n+ ## Check if an executable is in the user\'s PATH\n+ #\n+ # @param exeName name of t'..b'me)\n+ \n+ checkSectionInConfigFile = staticmethod( checkSectionInConfigFile )\n+ \n+ \n+ ## Check if an option is in a specified section in the configuration file\n+ #\n+ # @param config filehandle of configuration file\n+ # @param sectionName string of section name\n+ # @param optionName string of option name to check\n+ # @exception NoOptionError: if option not found raise a NoOptionError\n+ #\n+ def checkOptionInSectionInConfigFile( config, sectionName, optionName ):\n+ config.get( sectionName, optionName )\n+ \n+ checkOptionInSectionInConfigFile = staticmethod( checkOptionInSectionInConfigFile )\n+ \n+ \n+ ## Check version number coherency between configFile and CHANGELOG\n+ #\n+ # @param config ConfigParser Instance of configuration file\n+ # @param changeLogFileHandle CHANGELOG file handle\n+ # @exception NoOptionError: if option not found raise a NoOptionError\n+ #\n+ def checkConfigVersion( changeLogFileHandle, config ):\n+ line = changeLogFileHandle.readline()\n+ while not line.startswith("REPET release "):\n+ line = changeLogFileHandle.readline()\n+ numVersionChangeLog = line.split()[2]\n+ \n+ numVersionConfig = config.get("repet_env", "repet_version")\n+ \n+ if not numVersionChangeLog == numVersionConfig:\n+ message = "*** Error: wrong config file version. 
Expected version num is " + numVersionChangeLog + " but actual in config file is " + numVersionConfig\n+ raise CheckerException(message)\n+ \n+ checkConfigVersion = staticmethod( checkConfigVersion )\n+ \n+ \n+ ## Get version number from CHANGELOG\n+ #\n+ # @param changeLogFile CHANGELOG file name\n+ #\n+ def getVersionFromChangelogFile(changeLogFileName):\n+ with open(changeLogFileName) as changeLogFileHandle:\n+ line = changeLogFileHandle.readline()\n+ while not line.startswith("REPET release "):\n+ line = changeLogFileHandle.readline()\n+ numVersionChangeLog = line.split()[2]\n+ return numVersionChangeLog\n+ \n+ \n+ getVersionFromChangelogFile = staticmethod( getVersionFromChangelogFile )\n+ \n+ \n+ ## Check if headers of an input file contain only alpha numeric characters and "_ : . -"\n+ #\n+ # @param fileHandler file handle\n+ # @exception CheckerException if bad header raise a CheckerException\n+ #\n+ def checkHeaders( fileHandler ):\n+ lHeaders = CheckerUtils._getHeaderFromFastaFile(fileHandler)\n+ p = re.compile(\'[^a-zA-Z0-9_:\\.\\-]\', re.IGNORECASE)\n+ lWrongHeaders = []\n+ for header in lHeaders:\n+ errList=p.findall(header)\n+ if len( errList ) > 0 :\n+ lWrongHeaders.append(header)\n+ if lWrongHeaders != []:\n+ exception = CheckerException()\n+ exception.setMessages(lWrongHeaders)\n+ raise exception\n+ \n+ checkHeaders = staticmethod( checkHeaders ) \n+ \n+ \n+ def _getHeaderFromFastaFile( inFile ):\n+ lHeaders = []\n+ while True:\n+ line = inFile.readline()\n+ if line == "":\n+ break\n+ if line[0] == ">":\n+ lHeaders.append( line[1:-1] )\n+ return lHeaders\n+ \n+ _getHeaderFromFastaFile = staticmethod( _getHeaderFromFastaFile ) \n+\n+\n+ ## Return True if an option is in a specified section in the configuration file, False otherwise\n+ #\n+ # @param config handler of configuration file\n+ # @param sectionName string of section name\n+ # @param optionName string of option name to check\n+ #\n+ def isOptionInSectionInConfig( configHandler, section, 
option ):\n+ try:\n+ CheckerUtils.checkOptionInSectionInConfigFile( configHandler, section, option ) \n+ except NoOptionError:\n+ return False\n+ return True\n+ \n+ isOptionInSectionInConfig = staticmethod( isOptionInSectionInConfig )\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/ConfigChecker.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/ConfigChecker.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,226 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import re\n+import sys\n+from commons.core.utils.RepetConfigParser import RepetConfigParser\n+from commons.core.checker.ConfigValue import ConfigValue\n+from commons.core.checker.IChecker import IChecker\n+from commons.core.checker.RepetException import RepetException\n+from commons.core.utils.FileUtils import FileUtils\n+\n+\n+class Rule(object):\n+ \n+ def __init__(self, mandatory= False, isPattern=False, type="", set=(), help =""):\n+ self.mandatory = mandatory\n+ self.isPattern = isPattern\n+ self.type = type\n+ self.set = set\n+ self.help = help\n+ \n+class ConfigRules(object):\n+ \n+ def __init__(self, configName = "", configDescription = ""):\n+ self.configName = configName\n+ self.configDescription = configDescription\n+ self.dRules4Sections={}\n+ \n+ def _addRule(self, section, option="DEFAULT", mandatory=False, isPattern=False, type="", set=(), help =""):\n+ if not self.dRules4Sections.has_key(section):\n+ self.dRules4Sections[section] = {}\n+ self.dRules4Sections[section][option]=Rule(mandatory, isPattern, type.lower(), set) \n+ \n+ def addRuleSection(self, section, mandatory=False, isPattern=False, help = ""):\n+ self._addRule(section = section, option = "DEFAULT", mandatory = mandatory, isPattern = isPattern, help = "")\n+ \n+ def addRuleOption(self, section, option, mandatory=False, isPattern=False, type="", set=(), help = ""):\n+ self._addRule(section = section, option = option, mandatory = mandatory, isPattern = isPattern, type = type, set=set , help = "")\n+ \n+ def isSectionMandatory(self, section):\n+ if self.dRules4Sections.has_key(section):\n+ if self.dRules4Sections[section].has_key("DEFAULT"):\n+ return self.dRules4Sections[section]["DEFAULT"].mandatory\n+ return False\n+ \n+ def isOptionMandatory(self, section, option):\n+ if self.dRules4Sections.has_key(section):\n+ if 
self.dRules4Sections[section].has_key(option):\n+ return self.dRules4Sections[section][option].mandatory\n+ return False\n+ \n+ def getRule(self, section, option):\n+ if self.dRules4Sections.has_key(section):\n+ if self.dRules4Sections[section].has_key(option):\n+ return self.dRules4Sections[section][option]\n+ '..b'on(sectionName, optionName):\n+ missingOption += "\\n - [%s]: %s" % (sectionName, optionName)\n+ if missingOption != "":\n+ raise RepetException ("Error in configuration file %s, following options are missing: %s\\n" % (self._configFileName, missingOption))\n+ \n+ def getSectionNamesAccordingPatternRules (self, sectionWordOrPattern, isPattern): \n+ lSectionsFoundAccordingPatternRules=[]\n+ if isPattern == False:\n+ if self._iRawConfig.has_section(sectionWordOrPattern):\n+ lSectionsFoundAccordingPatternRules.append(sectionWordOrPattern)\n+ else:\n+ for sectionName in self._iRawConfig.sections():\n+ if re.search(sectionWordOrPattern, sectionName, re.IGNORECASE):\n+ lSectionsFoundAccordingPatternRules.append(sectionName)\n+ return lSectionsFoundAccordingPatternRules\n+ \n+ def getOptionsNamesAccordingPatternRules(self, sectionName, optionWordOrPattern, isPattern):\n+ lOptionsFoundAccordingPatternRules=[]\n+ if isPattern == False:\n+ if self._iRawConfig.has_option(sectionName, optionWordOrPattern):\n+ lOptionsFoundAccordingPatternRules.append(optionWordOrPattern)\n+ else :\n+ for optionName in self._iRawConfig.options(sectionName):\n+ if re.search(optionWordOrPattern, optionName, re.IGNORECASE)!= None:\n+ lOptionsFoundAccordingPatternRules.append(optionName)\n+ return lOptionsFoundAccordingPatternRules\n+ \n+ def extendConfigRulesWithPatternRules(self):\n+ for sectionName in self._iConfigRules.dRules4Sections.keys():\n+ dRules4OptionsOfThisSection = self._iConfigRules.dRules4Sections[sectionName] \n+ lRawSections=[]\n+ if dRules4OptionsOfThisSection.has_key("DEFAULT"):\n+ mandatorySection = dRules4OptionsOfThisSection["DEFAULT"].mandatory\n+ 
isPatternSection = dRules4OptionsOfThisSection["DEFAULT"].isPattern\n+ lRawSections=self.getSectionNamesAccordingPatternRules(sectionName, isPatternSection)\n+ for rawSectionName in lRawSections:\n+ self._iExtendedConfigRules.addRuleSection(rawSectionName, "DEFAULT", mandatorySection )\n+ if mandatorySection and (len(lRawSections)==0):\n+ self._iExtendedConfigRules.addRuleSection(sectionName, "DEFAULT", mandatorySection )\n+ else:\n+ lRawSections.append(sectionName) \n+ for optionName in dRules4OptionsOfThisSection.keys():\n+ setOption = dRules4OptionsOfThisSection[optionName].set\n+ isPatternOption = dRules4OptionsOfThisSection[optionName].isPattern\n+ mandatoryOption = dRules4OptionsOfThisSection[optionName].mandatory\n+ typeOption = dRules4OptionsOfThisSection[optionName].type\n+ if optionName != "DEFAULT":\n+ for rawSectionName in lRawSections:\n+ lRawOptions=self.getOptionsNamesAccordingPatternRules(rawSectionName, optionName, isPatternOption)\n+ for rawOptionName in lRawOptions:\n+ self._iExtendedConfigRules.addRuleOption(rawSectionName, rawOptionName, mandatoryOption, False, typeOption, setOption)\n+ if mandatoryOption and (len(lRawOptions)==0):\n+ self._iExtendedConfigRules.addRuleOption(rawSectionName, optionName, mandatoryOption, False, typeOption, setOption)\n+ \n+ def getConfig(self):\n+ self.checkIfExistsConfigFile()\n+ iConfig = self.readConfigFile()\n+ self.setRawConfig(iConfig)\n+ self.extendConfigRulesWithPatternRules()\n+ self.checkMandatorySections()\n+ self.checkMandatoryOptions()\n+ self.setConfig(iConfig)\n+ return self._iRawConfig\n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/ConfigException.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/ConfigException.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,53 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +from commons.core.checker.RepetException import RepetException + +## A exception raised by check() method of class ConfigChecker +# +# This class allow storage of multiple messages (see messages attribute). +# Example: use one instance of ConfigException class for one section in configuration file. +# All messages relatives to this section are stored in messages attribute. 
class ConfigException( RepetException ):
    """Exception raised by the check() method of ConfigChecker.

    Besides the single message handled by RepetException, an instance
    carries a list of messages, e.g. every problem found for one section
    of a configuration file.
    """

    ## Constructor
    #
    # @param msg message embedded in Exception class
    # @param messages optional list of detailed messages; when omitted, a new
    #                 empty list is created for this instance
    #
    def __init__(self, msg, messages = None):
        RepetException.__init__(self, msg)
        # A literal [] default would be created once and shared by every
        # instance built without the argument, so appending to one
        # exception's messages would leak into all later ones.
        if messages is None:
            messages = []
        # The caller's list is stored as-is (no copy), as it always was.
        self.messages = messages

    ## Return the list of detailed messages
    #
    def getMessages(self):
        return self.messages

    ## Replace the list of detailed messages
    #
    # @param messages list of messages
    #
    def setMessages(self, messages):
        self.messages = messages
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/ConfigValue.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/ConfigValue.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,70 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
class ConfigValue(object):
    """In-memory, two-level store of configuration values.

    Values are kept in dOptionsValues4Sections, a dictionary mapping a
    section name to a dictionary of {option name: option value}.
    """

    def __init__(self):
        self.dOptionsValues4Sections = {}

    ## Return True if the section is known, False otherwise
    #
    # @param sectionName string of section name
    #
    def has_section(self, sectionName):
        # 'in' replaces dict.has_key(), which no longer exists in Python 3.
        return sectionName in self.dOptionsValues4Sections

    ## Return True if the option exists in the given section, False otherwise
    #
    # @param sectionName string of section name
    # @param optionName string of option name
    #
    def has_option(self, sectionName, optionName):
        if not self.has_section(sectionName):
            return False
        return optionName in self.dOptionsValues4Sections[sectionName]

    ## Return the list of section names
    #
    def sections(self):
        # list() guarantees a real list on every Python version.
        return list(self.dOptionsValues4Sections.keys())

    ## Return the list of option names of a section (empty list if the section is unknown)
    #
    # @param sectionName string of section name
    #
    def options(self, sectionName):
        if not self.has_section(sectionName):
            return []
        return list(self.dOptionsValues4Sections[sectionName].keys())

    ## Return the value of an option, or None if the section or the option is unknown
    #
    # @param sectionName string of section name
    # @param optionName string of option name
    #
    def get(self, sectionName, optionName):
        if self.has_option(sectionName, optionName):
            return self.dOptionsValues4Sections[sectionName][optionName]
        return None

    ## Set the value of an option, creating the section when needed
    #
    # @param sectionName string of section name
    # @param optionName string of option name
    # @param optionValue value to store
    #
    def set(self, sectionName, optionName, optionValue):
        if not self.has_section(sectionName):
            self.dOptionsValues4Sections[sectionName] = {}
        self.dOptionsValues4Sections[sectionName][optionName] = optionValue

    ## Replace the whole {section: {option: value}} dictionary
    #
    # @param dOptionsValues4Sections dictionary of dictionaries
    #
    def setdOptionsValues4Sections(self, dOptionsValues4Sections):
        self.dOptionsValues4Sections = dOptionsValues4Sections

    ## Equal operator: two instances are equal when they store the same dictionary content
    #
    def __eq__(self, o):
        # Comparing with an unrelated object must not raise AttributeError.
        if not isinstance(o, ConfigValue):
            return NotImplemented
        return self.dOptionsValues4Sections == o.dOptionsValues4Sections

    ## Unequal operator
    #
    # @note Python 2 does not derive != from ==, so it is defined explicitly
    #
    def __ne__(self, o):
        isEqual = self.__eq__(o)
        if isEqual is NotImplemented:
            return isEqual
        return not isEqual
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/IChecker.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/IChecker.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,45 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +## Interface for a checker +# +# This class emulates an interface for a checker. +# +# All checkers are subclasses of IChecker. +# +class IChecker( object ): + + ## perform check, raise a CheckerException if error occurred + # + # @param arg a collecting parameter: put here all you need to perform check + # + def check(self, arg=""): + pass |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/OldConfigChecker.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/OldConfigChecker.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,101 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 


import ConfigParser
from ConfigParser import NoOptionError
from commons.core.checker.IChecker import IChecker
from commons.core.checker.ConfigException import ConfigException


## A checker for a configuration file
#
#
# A configuration file is formatted as follow:
#
# [section1]
#
# option_name1: option_value1
#
# option_name2: option_value2
#
# option_name3: option_value3
#
# [section2]
#
# ...
#
#
# This class performs 3 checks on a configuration file:
#
# (i) check if file exists
#
# (ii) check if section exists
#
# (iii) check if option exists
#
class ConfigChecker( IChecker ):

    ## Constructor A checker for configuration file.
    #
    # @param sectionName name of section to check in configuration file
    # @param optionsDict dictionary with option(s) to check as keys and empty strings ("") as values
    def __init__ (self, sectionName, optionsDict):
        self._sectionName = sectionName
        self._optionsDict = optionsDict


    ## Perform 3 checks : file exists, sections exists, option exists
    #
    # @param configFile configuration file to check
    # @exception ConfigException with a list of messages
    # @note on success, each value of optionsDict is replaced by the value read from the file
    def check (self, configFile):
        config = ConfigParser.ConfigParser()
        msg = []
        try:
            # The with-block closes the handle even if parsing fails;
            # the handle was previously left open.
            with open(configFile) as configHandle:
                config.readfp( configHandle )
        except IOError as e:
            # str(e) carries errno, reason and file name; e.message is empty
            # for an IOError built from (errno, strerror).
            msg.append("CONFIG FILE not found - " + str(e))
            raise ConfigException("", msg)

        if not (config.has_section(self._sectionName)):
            msg.append("[" + self._sectionName + "]" + " section not found - ")
            raise ConfigException("", msg)

        # Collect every missing option before raising, so the caller gets
        # the complete list in one exception.
        for key in self._optionsDict.keys():
            try:
                self._optionsDict[key] = config.get(self._sectionName, key)
            except NoOptionError as e:
                msg.append("[" + self._sectionName + "]" + " - " + str(e))

        # msg is non-empty here only if at least one option was missing.
        if msg:
            raise ConfigException("", msg)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/checker/RepetException.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/checker/RepetException.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,51 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 


class RepetException(Exception):
    """Base exception of the project, carrying a single message."""

    ## Constructor
    #
    # @param msg message embedded in the exception
    #
    def __init__(self, msg):
        # Hand the message to Exception so that 'args' is populated:
        # copy and pickle rebuild an exception by calling cls(*args), which
        # fails when args is empty because msg is a required argument.
        Exception.__init__(self, msg)
        self._message = msg

    ## Convert the object into a string
    #
    def __str__(self):
        # __str__ must return a string even when msg is not one.
        return str(self._message)

    ## Return the message
    #
    def getMessage(self):
        return self._message

    ## Replace the message
    #
    # @param msg new message
    #
    def setMessage(self, msg):
        self._message = msg
        # Keep 'args' in sync so a copied exception carries the new message.
        self.args = (msg,)


class RepetDataException(RepetException):
    """RepetException subclass; adds no state or behaviour of its own."""

    def __init__(self, msg):
        RepetException.__init__(self, msg)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/Align.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Align.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,428 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+import time\n+\n+from commons.core.coord.Range import Range\n+from commons.core.coord.Map import Map\n+\n+\n+## Handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity)\n+#\n+class Align( object ):\n+ \n+ ## Constructor\n+ #\n+ # @param range_q: a Range instance for the query\n+ # @param range_s: a Range instance for the subject\n+ # @param e_value: E-value of the match \n+ # @param identity: identity percentage of the match\n+ # @param score: score of the match\n+ #\n+ def __init__(self, range_q=Range(), range_s=Range(), e_value=0, score=0, identity=0):\n+ self.range_query = range_q\n+ self.range_subject = range_s\n+ self.e_value = float(e_value)\n+ self.score = float(score)\n+ self.identity = float(identity)\n+ \n+ ## Return True if the instance is empty, False otherwise\n+ #\n+ def isEmpty(self):\n+ return self.range_query.isEmpty() or self.range_subject.isEmpty()\n+ \n+ ## Equal operator\n+ #\n+ def __eq__(self, o):\n+ if self.range_query==o.range_query and self.range_subject==o.range_subject and \\\n+ self.e_value==o.e_value and self.score==o.score and self.identity==o.identity:\n+ return True\n+ return False\n+ \n+ ## Unequal operator\n+ #\n+ # @param o a Range instance\n+ #\n+ def __ne__(self, o):\n+ return not self.__eq__(o)\n+ \n+ ## Convert the object into a string\n+ #\n+ # @note used in \'print myObject\'\n+ #\n+ def __str__( self ):\n+ return self.toString()\n+ \n+ ## Read attributes from an Align file\n+ # \n+ # @param fileHandler: file handler of the file being read\n+ # @return: 1 on success, 0 at the end of the file \n+ #\n+ def read(self, fileHandler):\n+ self.reset()\n+ line = fileHandler.readline()\n+ if line == "":\n+ return 0\n+ tokens = line.split("\\t")\n+ if len(tokens) < len(self.__dict__.keys()):\n+ return 0\n+ 
self.setFromTuple(tokens)\n+ return 1\n+ \n+ ## Set attributes from tuple\n+ #\n+ # @param tuple a tuple with (queryName,queryStart,queryEnd,subjectName,subjectStar,subjectEnd,E-value,score,identity)\n+ # @note data are loaded such that the query is always on the direct strand\n+ #\n+ def setFromTuple( self, tuple ):\n+ #'..b' self.identity = max(self.identity,o.identity)\n+ \n+ ## Return a Map instance with the subject mapped on the query\n+ #\n+ def getSubjectAsMapOfQuery(self):\n+ iMap = Map()\n+ iMap.name = self.range_subject.seqname\n+ iMap.seqname = self.range_query.seqname\n+ if self.range_subject.isOnDirectStrand():\n+ iMap.start = self.range_query.start\n+ iMap.end = self.range_query.end\n+ else:\n+ iMap.start = self.range_query.end\n+ iMap.end = self.range_query.start\n+ return iMap\n+ \n+ ## Return True if query is on direct strand\n+ #\n+ def isQueryOnDirectStrand( self ):\n+ return self.range_query.isOnDirectStrand()\n+ \n+ ## Return True if subject is on direct strand\n+ #\n+ def isSubjectOnDirectStrand( self ):\n+ return self.range_subject.isOnDirectStrand()\n+ \n+ ## Return True if query and subject are on the same strand, False otherwise\n+ #\n+ def areQrySbjOnSameStrand(self):\n+ return self.isQueryOnDirectStrand() == self.isSubjectOnDirectStrand()\n+ \n+ ## Return False if query and subject are on the same strand, True otherwise\n+ #\n+ def areQrySbjOnOppositeStrands(self):\n+ return not self.areQrySbjOnSameStrand()\n+\n+ ## Set attributes from string\n+ #\n+ # @param string a string formatted like queryName queryStart queryEnd subjectName subjectStart subjectEnd E-value score identity\n+ # @param sep field separator\n+ #\n+ def setFromString(self, string, sep="\\t"):\n+ if string[-1] == "\\n":\n+ string = string[:-1]\n+ self.setFromTuple( string.split(sep) )\n+ \n+ ## Return a first Map instance for the query and a second for the subject\n+ #\n+ def getMapsOfQueryAndSubject(self):\n+ iMapQuery = Map( name="repet",\n+ 
seqname=self.range_query.seqname,\n+ start=self.range_query.start,\n+ end=self.range_query.end )\n+ iMapSubject = Map( name="repet",\n+ seqname=self.range_subject.seqname,\n+ start=self.range_subject.start,\n+ end=self.range_subject.end )\n+ return iMapQuery, iMapSubject\n+ \n+ ## Write query coordinates as Map in a file\n+ #\n+ # @param fileHandler: file handler of the file being filled\n+ #\n+ def writeSubjectAsMapOfQuery( self, fileHandler ):\n+ m = self.getSubjectAsMapOfQuery()\n+ m.write( fileHandler )\n+ \n+ ## Return a bin for fast database access\n+ #\n+ def getBin(self):\n+ return self.range_query.getBin()\n+ \n+ ## Switch query and subject\n+ #\n+ def switchQuerySubject( self ):\n+ tmpRange = self.range_query\n+ self.range_query = self.range_subject\n+ self.range_subject = tmpRange\n+ if not self.isQueryOnDirectStrand():\n+ self.reverse()\n+ \n+ ## Return True if the query overlaps with the query of another Align instance, False otherwise\n+ #\n+ def isQueryOverlapping( self, iAlign ):\n+ return self.getQueryAsRange().isOverlapping( iAlign.getQueryAsRange() )\n+ \n+ ## Return True if the subject overlaps with the subject of another Align instance, False otherwise\n+ #\n+ def isSubjectOverlapping( self, iAlign ):\n+ return self.getSubjectAsRange().isOverlapping( iAlign.getSubjectAsRange() )\n+ \n+ ## Return True if the Align instance overlaps with another Align instance, False otherwise\n+ #\n+ def isOverlapping( self, iAlign ):\n+ if self.isQueryOverlapping( iAlign ) and self.isSubjectOverlapping( iAlign ):\n+ return True\n+ else:\n+ return False\n+ \n+ ## Update the score\n+ #\n+ # @note the new score is the length on the query times the percentage of identity\n+ #\n+ def updateScore( self ):\n+ newScore = self.getLengthOnQuery() * self.getIdentity() / 100.0\n+ self.score = newScore\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/AlignUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/AlignUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,359 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import os\n+import sys\n+import shutil\n+from commons.core.coord.Align import Align\n+\n+\n+## Static methods manipulating Align instances\n+#\n+class AlignUtils( object ):\n+ \n+ ## Return a list with Align instances from the given file\n+ #\n+ # @param inFile name of a file in the Align format\n+ #\n+ def getAlignListFromFile( inFile ):\n+ lAlignInstances = []\n+ inFileHandler = open( inFile, "r" )\n+ while True:\n+ line = inFileHandler.readline()\n+ if line == "":\n+ break\n+ a = Align()\n+ a.setFromString( line )\n+ lAlignInstances.append( a )\n+ inFileHandler.close()\n+ return lAlignInstances\n+\n+ getAlignListFromFile = staticmethod( getAlignListFromFile )\n+ \n+ \n+ ## Return a list with all the scores\n+ #\n+ # @param lAlignInstances: list of Align instances\n+ #\n+ def getListOfScores( lAlignInstances ):\n+ lScores = []\n+ for iAlign in lAlignInstances:\n+ lScores.append( iAlign.score )\n+ return lScores\n+ \n+ getListOfScores = staticmethod( getListOfScores )\n+\n+ \n+ ## Return a list with all the scores from the given file\n+ #\n+ # @param inFile name of a file in the Align format\n+ #\n+ def getScoreListFromFile(inFile):\n+ lScores = []\n+ append = lScores.append\n+ with open(inFile, "r") as inFileHandler:\n+ line = inFileHandler.readline()\n+ while line:\n+ if line != "\\n":\n+ append(int(line.split(\'\\t\')[7]))\n+ line = inFileHandler.readline()\n+ return lScores\n+ \n+ getScoreListFromFile = staticmethod( getScoreListFromFile )\n+ \n+ \n+ ## for each line of a given Align file, write the coordinates on the query and the subject as two distinct lines in a Map file\n+ #\n+ # @param alignFile: name of the input Align file\n+ # @param mapFile: name of the output Map file\n+ #\n+ def convertAlignFileIntoMapFileWithQueriesAndSubjects( alignFile, mapFile ):\n+ alignFileHandler = open( alignFile, "r" )\n+ 
mapFileHandler = open( mapFile, "w" )\n+ iAlign = Align()\n+ while True:\n+ line = alignFileHandler.readline()\n+ if line == "":\n+ break\n+ iAlign.setFromString( line )\n+ iMapQ, iMap'..b'Dir)\n+ \n+ createAlignFiles = staticmethod( createAlignFiles )\n+ \n+ \n+ ## Return a list with Align instances sorted by query name, subject name, query start, query end and score\n+ #\n+ def sortList( lAligns ):\n+ return sorted( lAligns, key=lambda iAlign: ( iAlign.getQueryName(),\n+ iAlign.getSubjectName(),\n+ iAlign.getQueryStart(),\n+ iAlign.getQueryEnd(),\n+ iAlign.getScore() ) )\n+ \n+ sortList = staticmethod( sortList )\n+ \n+ \n+ ## Return a list after merging all overlapping Align instances\n+ #\n+ def mergeList( lAligns ):\n+ lMerged = []\n+ \n+ lSorted = AlignUtils.sortList( lAligns )\n+ \n+ prev_count = 0\n+ for iAlign in lSorted:\n+ if prev_count != len(lSorted):\n+ for i in lSorted[ prev_count + 1: ]:\n+ if iAlign.isOverlapping( i ):\n+ iAlign.merge( i )\n+ IsAlreadyInList = False\n+ for newAlign in lMerged:\n+ if newAlign.isOverlapping( iAlign ):\n+ IsAlreadyInList = True\n+ newAlign.merge( iAlign )\n+ lMerged [ lMerged.index( newAlign ) ] = newAlign\n+ if not IsAlreadyInList:\n+ lMerged.append( iAlign )\n+ prev_count += 1\n+ \n+ return lMerged\n+ \n+ mergeList = staticmethod( mergeList )\n+ \n+ \n+ ## Merge all Align instance in a given Align file\n+ #\n+ def mergeFile( inFile, outFile="" ):\n+ if outFile == "":\n+ outFile = "%s.merged" % ( inFile )\n+ if os.path.exists( outFile ):\n+ os.remove( outFile )\n+ \n+ tmpFile = "%s.sorted" % ( inFile )\n+ AlignUtils.sortAlignFile( inFile, tmpFile )\n+ \n+ tmpF = open( tmpFile, "r" )\n+ dQrySbj2Aligns = {}\n+ prevPairQrySbj = ""\n+ while True:\n+ line = tmpF.readline()\n+ if line == "":\n+ break\n+ iAlign = Align()\n+ iAlign.setFromString( line )\n+ pairQrySbj = "%s_%s" % ( iAlign.getQueryName(), iAlign.getSubjectName() )\n+ if not dQrySbj2Aligns.has_key( pairQrySbj ):\n+ if prevPairQrySbj != "":\n+ lMerged = 
AlignUtils.mergeList( dQrySbj2Aligns[ prevPairQrySbj ] )\n+ AlignUtils.writeListInFile( lMerged, outFile, "a" )\n+ del dQrySbj2Aligns[ prevPairQrySbj ]\n+ prevPairQrySbj = pairQrySbj\n+ else:\n+ prevPairQrySbj = pairQrySbj\n+ dQrySbj2Aligns[ pairQrySbj ] = []\n+ dQrySbj2Aligns[ pairQrySbj ].append( iAlign )\n+ lMerged = []\n+ if len(dQrySbj2Aligns.keys()) > 0:\n+ lMerged = AlignUtils.mergeList( dQrySbj2Aligns[ prevPairQrySbj ] )\n+ AlignUtils.writeListInFile( lMerged, outFile, "a" )\n+ tmpF.close()\n+ os.remove( tmpFile )\n+ \n+ mergeFile = staticmethod( mergeFile )\n+\n+\n+ ## Update the scores of each match in the input file\n+ #\n+ # @note the new score is the length on the query times the percentage of identity\n+ #\n+ def updateScoresInFile( inFile, outFile ):\n+ inHandler = open( inFile, "r" )\n+ outHandler = open( outFile, "w" )\n+ iAlign = Align()\n+ \n+ while True:\n+ line = inHandler.readline()\n+ if line == "":\n+ break\n+ iAlign.reset()\n+ iAlign.setFromString( line, "\\t" )\n+ iAlign.updateScore()\n+ iAlign.write( outHandler )\n+ \n+ inHandler.close()\n+ outHandler.close()\n+ \n+ updateScoresInFile = staticmethod( updateScoresInFile )\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/ConvCoord.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/ConvCoord.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,504 @@\n+#!/usr/bin/env python\n+\n+##@file\n+# Convert coordinates from chunks to chromosomes or the opposite.\n+#\n+# usage: ConvCoord.py [ options ]\n+# options:\n+# -h: this help\n+# -i: input data with coordinates to convert (file or table)\n+# -f: input data format (default=\'align\'/\'path\')\n+# -c: coordinates to convert (query, subject or both; default=\'q\'/\'s\'/\'qs\')\n+# -m: mapping of chunks on chromosomes (format=\'map\')\n+# -x: convert from chromosomes to chunks (opposite by default)\n+# -o: output data (file or table, same as input)\n+# -C: configuration file (for database connection)\n+# -v: verbosity level (default=0/1/2)\n+\n+\n+import os\n+import sys\n+import getopt\n+import time\n+from commons.core.sql.DbFactory import DbFactory\n+from commons.core.coord.MapUtils import MapUtils\n+from commons.core.sql.TableMapAdaptator import TableMapAdaptator\n+from commons.core.sql.TablePathAdaptator import TablePathAdaptator\n+from commons.core.coord.PathUtils import PathUtils\n+from commons.core.coord.Align import Align\n+from commons.core.coord.Path import Path\n+from commons.core.coord.Range import Range\n+\n+\n+## Class to handle coordinate conversion\n+#\n+class ConvCoord( object ):\n+ \n+ ## Constructor\n+ #\n+ def __init__( self, inData="", mapData="", outData="", configFile="", verbosity=0):\n+ self._inData = inData\n+ self._formatInData = "align"\n+ self._coordToConvert = "q"\n+ self._mapData = mapData\n+ self._mergeChunkOverlaps = True\n+ self._convertChunks = True\n+ self._outData = outData\n+ self._configFile = configFile\n+ self._verbose = verbosity\n+ self._typeInData = "file"\n+ self._typeMapData = "file"\n+ self._tpa = None\n+ if self._configFile != "" and os.path.exists(self._configFile):\n+ self._iDb = DbFactory.createInstance(self._configFile)\n+ else:\n+ self._iDb = DbFactory.createInstance()\n+ \n+ \n+ ## Display the help on stdout\n+ #\n+ def help( self ):\n+ print\n+ print "usage: ConvCoord.py [ options ]"\n+ print 
"options:"\n+ print " -h: this help"\n+ print " -i: input data with coordinates to convert (file or table)"\n+ print " -f: input data format (default=\'align\'/\'path\')"\n+ print " -c: coordinates to convert (query, subject or both; default=\'q\'/\'s\'/\'qs\')"\n+ print " -m: mapping of chunks on chromosomes (format=\'map\')"\n+ print " -M: merge chunk overlaps (default=yes/no)"\n+ print " -x: convert from chromosomes to chunks (opposite by default)"\n+ print " -o: output data (file or table, same as input)"\n+ print " -C: configuration file (for database connection)"\n+ print " -v: verbosity level (default=0/1/2)"\n+ print\n+ \n+ \n+ ## Set the attributes from the command-line\n+ #\n+ def setAttributesFromCmdLine( self ):\n+ try:\n+ opts, args = getopt.getopt(sys.argv[1:],"hi:f:c:m:M:xo:C:v:")\n+ except getopt.GetoptError, err:\n+ sys.stderr.write( "%s\\n" % ( str(err) ) )\n+ self.help(); sys.exit(1)\n+ for o,a in opts:\n+ if o == "-h":\n+ self.help(); sys.exit(0)\n+ elif o == "-i":\n+ self.setInputData( a )\n+ elif o == "-f":\n+ self.setInputFormat( a )\n+ elif o == "-c":\n+ self.setCoordinatesToConvert( a )\n+ elif o == "-m":\n+ self.setMapData( a )\n+ elif o == "-M":\n+ self.setMergeChunkOverlaps( a )\n+ elif o == "-o":\n+ self.setOutputData( a )\n+ elif o == "-C":\n+ self.setConfigFile( a )\n+ elif o == "-v":\n+ self.setVerbosityLevel( a )\n+ \n+ \n+ def setInputData( self, inData ):\n+ self._inData = inData\n+ '..b'ile( tmpPathTable, tmpPathTable, False )\n+ self._iDb.dropTable( tmpPathTable )\n+ if self._formatInData == "align":\n+ PathUtils.convertPathFileIntoAlignFile( tmpPathTable, outFile )\n+ os.remove( tmpPathTable )\n+ elif self._formatInData == "path":\n+ os.rename( tmpPathTable, outFile )\n+ \n+ \n+ def saveChrCoordsAsTable( self, tmpPathTable, outTable ):\n+ if self._formatInData == "align":\n+ self._iDb.convertPathTableIntoAlignTable( tmpPathTable, outTable )\n+ self._iDb.dropTable( tmpPathTable )\n+ elif self._formatInData == "path":\n+ 
self._iDb.renameTable( tmpPathTable, outTable )\n+ \n+ \n+ ## Convert coordinates from chunks to chromosomes\n+ #\n+ def convertCoordinatesFromChunksToChromosomes( self ):\n+ dChunks2CoordMaps = self.getChunkCoordsOnChromosomes()\n+ \n+ if self._typeInData == "file":\n+ tmpPathTable = self.convCoordsChkToChrFromFile( self._inData, self._formatInData, dChunks2CoordMaps )\n+ elif self._typeInData == "table":\n+ tmpPathTable = self.convCoordsChkToChrFromTable( self._inData, self._formatInData, dChunks2CoordMaps )\n+ \n+ if self._mergeChunkOverlaps:\n+ self.mergeCoordsOnChunkOverlaps( dChunks2CoordMaps, tmpPathTable );\n+ \n+ if self._typeInData == "file":\n+ self.saveChrCoordsAsFile( tmpPathTable, self._outData )\n+ elif self._typeInData == "table":\n+ self.saveChrCoordsAsTable( tmpPathTable, self._outData )\n+ \n+ \n+ ## Convert coordinates from chromosomes to chunks\n+ #\n+ def convertCoordinatesFromChromosomesToChunks( self ):\n+ msg = "ERROR: convert coordinates from chromosomes to chunks not yet available"\n+ sys.stderr.write( "%s\\n" % ( msg ) )\n+ sys.exit(1)\n+ \n+ \n+ ## Useful commands before running the program\n+ #\n+ def start( self ):\n+ self.checkAttributes()\n+ if self._verbose > 0:\n+ msg = "START ConvCoord.py (%s)" % ( time.strftime("%m/%d/%Y %H:%M:%S") )\n+ msg += "\\ninput data: %s" % ( self._inData )\n+ if self._typeInData == "file":\n+ msg += " (file)\\n"\n+ else:\n+ msg += " (table)\\n"\n+ msg += "format: %s\\n" % ( self._formatInData )\n+ msg += "coordinates to convert: %s\\n" % ( self._coordToConvert )\n+ msg += "mapping data: %s" % ( self._mapData )\n+ if self._typeMapData == "file":\n+ msg += " (file)\\n"\n+ else:\n+ msg += " (table)\\n"\n+ if self._mergeChunkOverlaps:\n+ msg += "merge chunk overlaps\\n"\n+ else:\n+ msg += "don\'t merge chunk overlaps\\n"\n+ if self._convertChunks:\n+ msg += "convert chunks to chromosomes\\n"\n+ else:\n+ msg += "convert chromosomes to chunks\\n"\n+ msg += "output data: %s" % ( self._outData )\n+ if 
self._typeInData == "file":\n+ msg += " (file)\\n"\n+ else:\n+ msg += " (table)\\n"\n+ sys.stdout.write( msg )\n+ \n+ \n+ ## Useful commands before ending the program\n+ #\n+ def end( self ):\n+ self._iDb.close()\n+ if self._verbose > 0:\n+ msg = "END ConvCoord.py (%s)" % ( time.strftime("%m/%d/%Y %H:%M:%S") )\n+ sys.stdout.write( "%s\\n" % ( msg ) )\n+ \n+ \n+ ## Run the program\n+ #\n+ def run( self ):\n+ self.start()\n+ \n+ if self._convertChunks:\n+ self.convertCoordinatesFromChunksToChromosomes()\n+ else:\n+ self.convertCoordinatesFromChromosomesToChunks()\n+ \n+ self.end()\n+ \n+ \n+if __name__ == "__main__":\n+ i = ConvCoord()\n+ i.setAttributesFromCmdLine()\n+ i.run()\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/Map.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Map.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,161 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +from commons.core.coord.Range import Range + + +## Record a named region on a given sequence +# +class Map( Range ): + + ## Constructor + # + # @param name the name of the region + # @param seqname the name of the sequence + # @param start the start coordinate + # @param end the end coordinate + # + def __init__(self, name="", seqname="", start=-1, end=-1): + self.name = name + Range.__init__( self, seqname, start, end ) + + ## Equal operator + # + # @param o a Map instance + # + def __eq__(self, o): + if self.name == o.name: + return Range.__eq__(self, o) + return False + + ## Return name + # + def getName( self ): + return self.name + + ## Set attributes from tuple + # + # @param tuple: a tuple with (name,seqname,start,end) + # + def setFromTuple(self, tuple): + self.name = tuple[0] + Range.setFromTuple(self, tuple[1:]) + + ## Set attributes from string + # + # @param string a string formatted like name<sep>seqname<sep>start<sep>end + # @param sep field separator + # + def setFromString(self, string, sep="\t"): + if string[-1] == "\n": + string = string[:-1] + self.setFromTuple( string.split(sep) ) + + ## Reset + # + def reset(self): + self.setFromTuple( [ "", "", -1, -1 ] ) + + ## Read attributes from a Map file + # + # @param fileHandler: file handler of the file being read + # @return: 1 on success, 0 at the end of the file + # + def read(self, fileHandler): + self.reset() + line = fileHandler.readline() + if line == "": + return 0 + tokens = line.split("\t") + if len(tokens) < len(self.__dict__.keys()): + return 0 + self.setFromTuple(tokens) + return 1 + + ## Return the attributes as a formatted string + # + def toString(self): + string = "%s" % (self.name) + string += "\t%s" % (Range.toString(self)) + return string + + ## Write attributes into a Map file + # + # @param fileHandler: file handler of the file being filled + # + def write(self, fileHandler): + fileHandler.write("%s\n" % (self.toString())) + + ## Save attributes into a Map file + # + # 
@param file: name of the file being filled + # + def save(self, file): + fileHandler = open( file, "a" ) + self.write( fileHandler ) + fileHandler.close() + + ## Return a Range instance with the attributes + # + def getRange(self): + return Range( self.seqname, self.start, self.end) + + ## Remove in the instance the region overlapping with another Map instance + # + # @param o a Map instance + # + def diff(self, o): + iRange = Range.diff(self, o.getRange()) + new = Map() + if not iRange.isEmpty(): + new.name = self.name + new.seqname = self.seqname + new.start = iRange.start + new.end = iRange.end + return new + + ## Write attributes in a Path file, the name being the subject and the rest the Range query + # + # @param fileHandler: file handler of a Path file + # + def writeAsQueryOfPath(self, fileHandler): + string = "0" + string += "\t%s" % ( self.seqname ) + string += "\t%i" % ( self.getMin() ) + string += "\t%i" % ( self.getMax() ) + string += "\t%s" % ( self.name ) + string += "\t0" + string += "\t0" + string += "\t0.0" + string += "\t0" + string += "\t0" + fileHandler.write( "%s\n" % ( string ) ) + |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/MapUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/MapUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,246 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import sys\n+import os\n+from commons.core.coord.Map import Map\n+from commons.core.coord.Set import Set\n+try:\n+ from commons.core.checker.CheckerUtils import CheckerUtils\n+except ImportError:\n+ pass\n+\n+\n+## static methods manipulating Map instances\n+#\n+class MapUtils( object ):\n+ \n+ ## Return a list of Map instances sorted in increasing order according to the min, then the max, and finally their initial order\n+ #\n+ # @param lMaps list of Map instances\n+ #\n+ def getMapListSortedByIncreasingMinThenMax( lMaps ):\n+ return sorted( lMaps, key=lambda iMap: ( iMap.getMin(), iMap.getMax() ) ) \n+ \n+ getMapListSortedByIncreasingMinThenMax = staticmethod( getMapListSortedByIncreasingMinThenMax )\n+ \n+ \n+ ## Return a list of Map instances sorted in increasing order according to the name, then the seqname, then the min, then the max\n+ #\n+ # @param lMaps list of Map instances\n+ #\n+ def getMapListSortedByIncreasingNameThenSeqnameThenMinThenMax( lMaps ):\n+ return sorted( lMaps, key=lambda iMap: ( iMap.getName(), iMap.getSeqname(), iMap.getMin(), iMap.getMax() ) ) \n+ \n+ getMapListSortedByIncreasingNameThenSeqnameThenMinThenMax = staticmethod( getMapListSortedByIncreasingNameThenSeqnameThenMinThenMax )\n+ \n+ \n+ ## Return a dictionary which keys are Map names and values the corresponding Map instances\n+ #\n+ def getDictPerNameFromMapFile( mapFile ):\n+ dName2Maps = {}\n+ mapFileHandler = open( mapFile, "r" )\n+ while True:\n+ line = mapFileHandler.readline()\n+ if line == "":\n+ break\n+ iMap = Map()\n+ iMap.setFromString( line, "\\t" )\n+ if dName2Maps.has_key( iMap.name ):\n+ if iMap == dName2Maps[ iMap.name ]:\n+ continue\n+ else:\n+ msg = "ERROR: in file \'%s\' two different Map instances have the same name \'%s\'" % ( mapFile, iMap.name )\n+ sys.stderr.write( "%s\\n" % ( msg ) )\n+ sys.exit(1)\n+ 
dName2Maps[ iMap.name ] = iMap\n+ mapFileHandler.close()\n+ return dName2Maps\n+ \n+ getDictPerNameFromMapFile = staticmethod( getDictPerNameFromMapFile )\n+\n+ \n+ ## Give a list of Set instances from a list of Map instances\n+ #\n+ # @param lMaps list of Map '..b's.rename( "%s.merge" % inFile,\n+ outFile )\n+ \n+ mergeCoordsInFile = staticmethod( mergeCoordsInFile )\n+ \n+ \n+ ## Return a dictionary which keys are Map seqnames and values the corresponding Map instances\n+ #\n+ def getDictPerSeqNameFromMapFile( mapFile ):\n+ dSeqName2Maps = {}\n+ mapFileHandler = open( mapFile, "r" )\n+ while True:\n+ line = mapFileHandler.readline()\n+ if line == "":\n+ break\n+ iMap = Map()\n+ iMap.setFromString( line, "\\t" )\n+ if not dSeqName2Maps.has_key( iMap.seqname ):\n+ dSeqName2Maps[ iMap.seqname ] = []\n+ dSeqName2Maps[ iMap.seqname ].append( iMap )\n+ mapFileHandler.close()\n+ return dSeqName2Maps\n+ \n+ getDictPerSeqNameFromMapFile = staticmethod( getDictPerSeqNameFromMapFile )\n+ \n+ \n+ ## Convert an Map file into a Set file\n+ #\n+ # @param mapFile string input map file name\n+ # @param setFile string output set file name\n+ #\n+ def convertMapFileIntoSetFile( mapFileName, setFileName = "" ):\n+ if setFileName == "":\n+ setFileName = "%s.set" % mapFileName\n+ mapFileHandler = open( mapFileName, "r" )\n+ setFileHandler = open( setFileName, "w" )\n+ iMap = Map()\n+ count = 0\n+ while True:\n+ line = mapFileHandler.readline()\n+ if line == "":\n+ break\n+ iMap.setFromString(line)\n+ count += 1\n+ iSet = Set()\n+ iSet.id = count\n+ iSet.name = iMap.getName()\n+ iSet.seqname = iMap.getSeqname()\n+ iSet.start = iMap.getStart()\n+ iSet.end = iMap.getEnd()\n+ iSet.write(setFileHandler)\n+ mapFileHandler.close()\n+ setFileHandler.close()\n+ \n+ convertMapFileIntoSetFile = staticmethod( convertMapFileIntoSetFile )\n+ \n+ ## Write Map instances contained in the given list\n+ #\n+ # @param lMaps list of Map instances\n+ # @param fileName a file name\n+ # @param mode the open 
mode of the file \'"w"\' or \'"a"\' \n+ #\n+ def writeListInFile(lMaps, fileName, mode="w"):\n+ fileHandler = open(fileName, mode)\n+ for iMap in lMaps:\n+ iMap.write(fileHandler)\n+ fileHandler.close()\n+ \n+ writeListInFile = staticmethod( writeListInFile )\n+\n+ \n+ ## Get the length of the shorter seq in map file\n+ #\n+ # @param mapFileName\n+ # @param mode the open mode of the file \'"w"\' or \'"a"\' \n+ #\n+ def getMinLengthOfMapFile(self, mapFileName):\n+ fileHandler = open(mapFileName, "r")\n+ line = fileHandler.readline()\n+ start = int (line.split(\'\\t\')[2])\n+ end = int (line.split(\'\\t\')[3])\n+ min = end - start + 1\n+ while True:\n+ line = fileHandler.readline()\n+ if line == "":\n+ break\n+ start = int (line.split(\'\\t\')[2])\n+ end = int (line.split(\'\\t\')[3])\n+ currentMin = end - start + 1\n+ if min >= currentMin:\n+ min = currentMin\n+ fileHandler.close()\n+ return min\n+\n+ ## Get the max length of the shorter seq in map file\n+ #\n+ # @param mapFileName\n+ # @param mode the open mode of the file \'"w"\' or \'"a"\' \n+ #\n+ def getMaxLengthOfMapFile(self, mapFileName):\n+ fileHandler = open(mapFileName, "r")\n+ line = fileHandler.readline()\n+ start = int (line.split(\'\\t\')[2])\n+ end = int (line.split(\'\\t\')[3])\n+ max = end - start + 1\n+ while True:\n+ line = fileHandler.readline()\n+ if line == "":\n+ break\n+ start = int (line.split(\'\\t\')[2])\n+ end = int (line.split(\'\\t\')[3])\n+ currentMax = end - start + 1\n+ if max <= currentMax:\n+ max = currentMax\n+ fileHandler.close()\n+ return max\n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/Match.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Match.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,206 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import sys\n+from commons.core.coord.Range import Range\n+from commons.core.coord.Path import Path\n+\n+\n+## Handle a chain of match(es) between two sequences, query and subject, with an identifier and the length of the input sequences\n+#\n+class Match( Path ):\n+ \n+ ## Constructor\n+ #\n+ def __init__(self):\n+ Path.__init__(self)\n+ self.query_length = -1\n+ self.query_length_perc = -1 # length of the match on the query / length of the query\n+ self.query_seqlength = -1\n+ self.match_length_perc = -1 # length of the match on the query / total length of the subject\n+ self.subject_length = -1\n+ self.subject_length_perc = -1 # length of the match on the subject / length of the subject\n+ self.subject_seqlength = -1\n+ \n+ ## Equal operator\n+ #\n+ def __eq__(self, o):\n+ if o == None \\\n+ or self.query_length != o.query_length or self.query_length_perc != o.query_length_perc\\\n+ or self.query_seqlength != o.query_seqlength or self.subject_length != o.subject_length\\\n+ or self.subject_length_perc != o.subject_length_perc or self.subject_seqlength != o.subject_seqlength\\\n+ or self.match_length_perc != o.match_length_perc:\n+ return False\n+ return Path.__eq__(self, o)\n+ \n+ ## Return the length of the match on the query divided by the total length of the query\n+ #\n+ def getLengthPercOnQuery(self):\n+ return self.query_length_perc\n+ \n+ ## Return the length of the match on the subject divided by the total length of the subject\n+ #\n+ def getLengthPercOnSubject(self):\n+ return self.subject_length_perc\n+ \n+ ## Return the length of the match on the subject\n+ #\n+ def getLengthMatchOnSubject(self):\n+ return self.subject_length\n+ \n+ ## Set attributes from a tuple\n+ # \n+ # @param tuple: a tuple with (query name,query start,query end,\n+ # query length, query length perc (between 0-1), match length 
perc (between 0-1), subject name,\n+ # subject start,subject end,subject length, subject length percentage (between 0-1), e_value,score,identity,id)\n+ #\n+ def setFromTuple( self, tuple ):\n+ queryStart = int(tuple[1])\n+ queryEnd = int(tuple[2])\n+ subjectStart = int(tuple[7])\n+ subjectEnd = int(tuple[8])\n+ if quer'..b'gth = -1\n+ self.match_length_perc = -1\n+ self.subject_length = -1\n+ self.subject_length_perc = -1\n+ self.subject_seqlength = -1\n+ \n+ ## Return a formated string of the attribute data\n+ # \n+ def toString( self ):\n+ string = "%s" % ( self.range_query.toString() )\n+ string += "\\t%i\\t%f" % ( self.query_length,\n+ self.query_length_perc )\n+ string += "\\t%f" % ( self.match_length_perc )\n+ string += "\\t%s" % ( self.range_subject.toString() )\n+ string += "\\t%i\\t%f" % ( self.subject_length,\n+ self.subject_length_perc )\n+ string += "\\t%g\\t%i\\t%f" % ( self.e_value,\n+ self.score,\n+ self.identity )\n+ string += "\\t%i" % ( self.id )\n+ return string\n+ \n+ ## Return a Path instance\n+ #\n+ def getPathInstance( self ):\n+ p = Path()\n+ tuple = ( self.id,\n+ self.range_query.seqname,\n+ self.range_query.start,\n+ self.range_query.end,\n+ self.range_subject.seqname,\n+ self.range_subject.start,\n+ self.range_subject.end,\n+ self.e_value,\n+ self.score,\n+ self.identity )\n+ p.setFromTuple( tuple )\n+ return p\n+ \n+ ## Give information about a match whose query is included in the subject\n+ # \n+ # @return string\n+ #\n+ def getQryIsIncluded( self ):\n+ string = "query %s (%d bp: %d-%d) is contained in subject %s (%d bp: %d-%d): id=%.2f - %.3f - %.3f - %.3f" %\\\n+ ( self.range_query.seqname, self.query_seqlength, self.range_query.start, self.range_query.end,\n+ self.range_subject.seqname, self.subject_seqlength, self.range_subject.start, self.range_subject.end,\n+ self.identity, self.query_length_perc, self.match_length_perc, self.subject_length_perc )\n+ return string\n+ \n+ def increaseLengthPercOnQuery(self, coverage):\n+ 
self.query_length_perc += coverage\n+ \n+ ## Compare the object with another match and see if they are equal\n+ # (same identity, E-value and score + same subsequences whether in query or subject)\n+ #\n+ # @return True if objects are equals False otherwise\n+ #\n+ def isDoublonWith( self, match, verbose=0 ):\n+\n+ # if both matches have same identity, score and E-value\n+ if self.identity == match.identity and self.score == match.score and self.e_value == match.e_value:\n+\n+ # if query and subject are identical\n+ if ( self.range_query.seqname == match.range_query.seqname \\\n+ and self.range_subject.seqname == match.range_subject.seqname ):\n+\n+ # if the coordinates are equal\n+ if self.range_query.__eq__( match.range_query ) and self.range_subject.__eq__( match.range_subject ):\n+ return True\n+\n+ else:\n+ if verbose > 0: print "different coordinates"; sys.stdout.flush()\n+ return False\n+\n+ # if query and subject are reversed but identical\n+ elif self.range_query.seqname == match.range_subject.seqname and self.range_subject.seqname == match.range_query.seqname:\n+\n+ # if the coordinates are equal\n+ if self.range_query.__eq__( match.range_subject ) and self.range_subject.__eq__( match.range_query ):\n+ return True\n+\n+ else:\n+ if verbose > 0: print "different coordinates"; sys.stdout.flush()\n+ return False\n+\n+ else:\n+ if verbose > 0: print "different sequence names"; sys.stdout.flush()\n+ return False\n+\n+ else:\n+ if verbose > 0: print "different match numbers"; sys.stdout.flush()\n+ return False\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/MatchUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/MatchUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,288 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+import math\n+import os\n+import sys\n+from commons.core.coord.Match import Match\n+from commons.core.checker.RepetException import RepetException\n+\n+## Static methods for the manipulation of Match instances\n+#\n+class MatchUtils ( object ):\n+ \n+ ## Return a list with Match instances from the given file\n+ #\n+ # @param inFile name of a file in the Match format\n+ # @return a list of Match instances\n+ #\n+ def getMatchListFromFile(inFile ):\n+ lMatchInstances = []\n+ inFileHandler = open( inFile, "r" )\n+ while True:\n+ line = inFileHandler.readline()\n+ if line == "":\n+ break\n+ if line[0:10] == "query.name":\n+ continue\n+ m = Match()\n+ m.setFromString( line )\n+ lMatchInstances.append( m )\n+ inFileHandler.close()\n+ return lMatchInstances\n+ \n+ getMatchListFromFile = staticmethod( getMatchListFromFile )\n+ \n+ ## Split a Match list in several Match lists according to the subject\n+ #\n+ # @param lMatches a list of Match instances\n+ # @return a dictionary which keys are subject names and values Match lists\n+ #\n+ def getDictOfListsWithSubjectAsKey( lMatches ):\n+ dSubject2MatchList = {}\n+ for iMatch in lMatches:\n+ if not dSubject2MatchList.has_key( iMatch.range_subject.seqname ):\n+ dSubject2MatchList[ iMatch.range_subject.seqname ] = []\n+ dSubject2MatchList[ iMatch.range_subject.seqname ].append( iMatch )\n+ return dSubject2MatchList\n+ \n+ getDictOfListsWithSubjectAsKey = staticmethod( getDictOfListsWithSubjectAsKey )\n+ \n+ ## Split a Match list in several Match lists according to the query\n+ #\n+ # @param lMatches a list of Match instances\n+ # @return a dictionary which keys are query names and values Match lists\n+ #\n+ def getDictOfListsWithQueryAsKey ( lMatches ):\n+ dQuery2MatchList = {}\n+ for iMatch in lMatches:\n+ if not dQuery2MatchList.has_key( iMatch.range_query.seqname ):\n+ 
dQuery2MatchList[ iMatch.range_query.seqname ] = []\n+ dQuery2MatchList[ iMatch.range_query.seqname ].append( iMatch )\n+ return dQuery2MatchList\n+ \n+ getDictOfListsWithQueryAsKey = staticmethod( getDictOfListsWithQueryAsKey ) \n+ \n+ ## Write M'..b' else:\n+ dMatches = MatchUtils.getDictOfListsWithSubjectAsKey(lMatches)\n+ \n+ for qry in dMatches.keys():\n+ countMatch = 0\n+ for match in dMatches[ qry ]:\n+ \n+ if match.identity >= thresIdentityPerc and getattr(match,whatToCount.lower() +"_length_perc") >= thresLength:\n+ countMatch += 1\n+ if countMatch > 0:\n+ countSbj += 1\n+ return countSbj\n+ \n+ getNbDistinctSequencesInsideMatchesWithThresh = staticmethod(getNbDistinctSequencesInsideMatchesWithThresh)\n+ \n+ ## Convert a \'match\' file (output from Matcher) into an \'align\' file\n+ ## replace old parser.tab2align\n+ #\n+ # @param inFileName a string input file name\n+ #\n+ def convertMatchFileToAlignFile(inFileName):\n+ basename = os.path.splitext(inFileName)[0]\n+ outFileName = "%s.align" % basename\n+ outFile = open(outFileName, "w")\n+ \n+ lMatches = MatchUtils.getMatchListFromFile(inFileName) \n+ \n+ for match in lMatches:\n+ string = "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" % ( match.getQueryName(), match.getQueryStart(), match.getQueryEnd(), match.getSubjectName(), match.getSubjectStart(), match.getSubjectEnd(), match.getEvalue(), match.getScore(), match.getIdentity() )\n+ outFile.write( string )\n+ \n+ outFile.close()\n+ \n+ convertMatchFileToAlignFile = staticmethod(convertMatchFileToAlignFile)\n+ \n+ ## Convert a \'match\' file (output from Matcher) into an \'abc\' file (MCL input file)\n+ # Use coverage on query for arc value\n+ #\n+ # @param matchFileName string input match file name\n+ # @param outFileName string output abc file name\n+ # @param coverage float query coverage filter threshold\n+ #\n+ @staticmethod\n+ def convertMatchFileIntoABCFileOnQueryCoverage(matchFileName, outFileName, coverage = 0):\n+ with open(matchFileName) as 
inF:\n+ with open(outFileName, "w") as outF:\n+ inF.readline()\n+ inLine = inF.readline()\n+ while inLine:\n+ splittedLine = inLine.split("\\t")\n+ if float(splittedLine[4]) >= coverage:\n+ outLine = "\\t".join([splittedLine[0], splittedLine[6], splittedLine[4]])\n+ outLine += "\\n"\n+ outF.write(outLine)\n+ inLine = inF.readline()\n+\n+ ## Adapt the path IDs as the input file is the concatenation of several \'Match\' files, and remove the extra header lines. \n+ ## replace old parser.tabnum2id\n+ #\n+ # @param fileName a string input file name\n+ # @param outputFileName a string output file name (optional)\n+ #\n+ def generateMatchFileWithNewPathId(fileName, outputFileName=None):\n+ if outputFileName is None: \n+ outFile = open(fileName, "w")\n+ else:\n+ outFile = open(outputFileName, "w") \n+ outFile.write("query.name\\tquery.start\\tquery.end\\tquery.length\\tquery.length.%\\tmatch.length.%\\tsubject.name\\tsubject.start\\tsubject.end\\tsubject.length\\tsubject.length.%\\tE.value\\tScore\\tIdentity\\tpath\\n")\n+ \n+ lMatches = MatchUtils.getMatchListFromFile(fileName) \n+ count = 1\n+ dMatchKeyIdcount = {}\n+ \n+ for match in lMatches:\n+ key_id = str(match.getIdentifier()) + "-" + match.getQueryName() + "-" + match.getSubjectName()\n+ if not key_id in dMatchKeyIdcount.keys():\n+ newPath = count\n+ count += 1\n+ dMatchKeyIdcount[ key_id ] = newPath\n+ else:\n+ newPath = dMatchKeyIdcount[ key_id ]\n+ \n+ match.id = newPath\n+ outFile.write( match.toString()+"\\n" ) \n+ outFile.close()\n+ \n+ generateMatchFileWithNewPathId = staticmethod(generateMatchFileWithNewPathId)\n+ \n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/MergedRange.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/MergedRange.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,98 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
## Record a region on multiple sequences using Path ID information
#
class MergedRange(object):

    ## Constructor
    #
    # @param lId list of Path ID (a new empty list is used when omitted or empty)
    # @param start the start coordinate
    # @param end the end coordinate
    #
    def __init__(self, lId = None, start = -1, end = -1):
        # 'None' stands in for the empty list so that instances never share a default list
        self._lId = lId or []
        self._start = start
        self._end = end

    ## Equal operator
    #
    # @param o a MergedRange instance
    # @return True if the ID lists, starts and ends are equal, False otherwise;
    #         NotImplemented if o is not a MergedRange (the comparison then evaluates to False)
    #
    def __eq__(self, o):
        if not isinstance(o, MergedRange):
            return NotImplemented
        return o._lId == self._lId and o._start == self._start and o._end == self._end

    ## Unequal operator
    #
    # @param o a MergedRange instance
    # @note Python 2 does not derive '!=' from __eq__, hence the explicit definition
    #
    def __ne__(self, o):
        isEqual = self.__eq__(o)
        if isEqual is NotImplemented:
            return isEqual
        return not isEqual

    ## Return True if the MergedRange instance overlaps with another MergedRange instance, False otherwise
    #
    # @param o a MergedRange instance
    # @return boolean False or True
    # @note coordinates are compared as stored (presumably start <= end for both instances; to be confirmed by callers)
    #
    def isOverlapping(self, o):
        # o spans the whole instance
        if o._start <= self._start and o._end >= self._end:
            return True
        # one boundary of o falls inside the instance
        if (o._start >= self._start and o._start <= self._end) or (o._end >= self._start and o._end <= self._end):
            return True
        return False

    ## Merge coordinates and ID of two Merged Range
    #
    # @param o a MergedRange instance
    # @note the instance is modified in place: it takes the smallest start, the greatest end
    #       and the sorted concatenation of both ID lists; 'o' is left unchanged
    #
    def merge(self, o):
        self._start = min(self._start, o._start)
        self._end = max(self._end, o._end)
        self._lId.extend(o._lId)
        self._lId.sort()

    ## Set a Merged Range instance using a Match instance
    #
    # @param iMatch instance Match instance (its 'id' and query range coordinates are read)
    #
    def setFromMatch(self, iMatch):
        self._lId = [iMatch.id]
        self._start = iMatch.range_query.start
        self._end = iMatch.range_query.end

    ## Get a Merged Range instance list using a Match instance list
    #
    # @param lIMatch list Match instance list
    # @return lMergedRange list MergedRange instance list (one per Match, in the same order)
    #
    @staticmethod
    def getMergedRangeListFromMatchList(lIMatch):
        lMergedRange = []
        for iMatch in lIMatch:
            mr = MergedRange()
            mr.setFromMatch(iMatch)
            lMergedRange.append(mr)
        return lMergedRange
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/Path.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Path.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,149 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.coord.Align import Align
from commons.core.coord.Set import Set
from commons.core.coord.Range import Range


## Handle a match between two sequences, query and subject (pair of coordinates with E-value, score and identity) with an identifier
#
class Path( Align ):

    ## Constructor
    #
    # @param id identifier
    # @param range_q: a Range instance for the query (a new empty Range when omitted)
    # @param range_s: a Range instance for the subject (a new empty Range when omitted)
    # @param e_value: E-value of the match
    # @param score: score of the match
    # @param identity: identity percentage of the match
    #
    def __init__( self, id=-1, range_q=None, range_s=None, e_value=0, score=0, identity=0 ):
        # A 'Range()' default would be evaluated only once, at definition time, and the
        # same object handed to every default-constructed Path; build a fresh one per call.
        if range_q is None:
            range_q = Range()
        if range_s is None:
            range_s = Range()
        self.id = int( id )
        Align.__init__( self, range_q, range_s, e_value, score, identity )

    ## Equal operator
    #
    # @param o a Path instance (or None)
    # @return False if o is None or has another identifier, otherwise the result of Align.__eq__
    #
    def __eq__(self, o):
        if o is None or self.id != o.id:
            return False
        return Align.__eq__(self, o)

    ## Set attributes from tuple
    #
    # @param tuple a tuple with (id,queryName,queryStart,queryEnd,subjectName,subjectStart,subjectEnd,E-value,score,identity)
    # @note data are loaded such that the query is always on the direct strand
    #
    def setFromTuple(self, tuple):
        self.id = int(tuple[0])
        # the remaining fields are handled by the parent class
        Align.setFromTuple(self, tuple[1:])

    ## Reset
    #
    def reset(self):
        self.id = -1
        Align.reset(self)

    ## Return the attributes as a formatted string
    #
    # @return the identifier followed by a tabulation and the Align string
    #
    def toString(self):
        string = "%i" % ( self.id )
        string += "\t%s" % (Align.toString(self))
        return string


    ## Return the identifier of the Path instance
    #
    def getIdentifier( self ):
        return self.id

    ## Return a Set instance with the subject mapped on the query
    #
    # @note the Set is named after the subject and located on the query sequence;
    #       its coordinates are the query ones, swapped if the subject is on the reverse strand
    #
    def getSubjectAsSetOfQuery(self):
        iSet = Set()
        iSet.id = self.id
        iSet.name = self.range_subject.seqname
        iSet.seqname = self.range_query.seqname
        if self.range_subject.isOnDirectStrand():
            iSet.start = self.range_query.start
            iSet.end = self.range_query.end
        else:
            iSet.start = self.range_query.end
            iSet.end = self.range_query.start
        return iSet

    #TODO: add tests !!!!
    #WARNING: subject always in direct strand !!!
    ## Return a Set instance with the query mapped on the subject
    #
    # @note the Set is named after the query and located on the subject sequence;
    #       the subject coordinates are swapped when the subject is on the reverse strand
    #
    def getQuerySetOfSubject(self):
        iSet = Set()
        iSet.id = self.id
        iSet.name = self.range_query.seqname
        iSet.seqname = self.range_subject.seqname
        if self.range_subject.isOnDirectStrand():
            iSet.start = self.range_subject.start
            iSet.end = self.range_subject.end
        else:
            iSet.start = self.range_subject.end
            iSet.end = self.range_subject.start
        return iSet

    ## Return True if the instance can be merged with another Path instance, False otherwise
    #
    # @param o a Path instance
    # @note both Paths must have different identifiers, the same query and subject names,
    #       the same strands, and overlapping query and subject ranges
    #
    def canMerge(self, o):
        return o.id != self.id \
            and o.range_query.seqname == self.range_query.seqname \
            and o.range_subject.seqname == self.range_subject.seqname \
            and o.range_query.isOnDirectStrand() == self.range_query.isOnDirectStrand() \
            and o.range_subject.isOnDirectStrand() == self.range_subject.isOnDirectStrand() \
            and o.range_query.isOverlapping(self.range_query) \
            and o.range_subject.isOverlapping(self.range_subject)

    ## Return an Align instance with the same attributes, except the identifier
    #
    def getAlignInstance(self):
        iAlign = Align()
        lAttributes = [ self.range_query.seqname,
                        self.range_query.start,
                        self.range_query.end,
                        self.range_subject.seqname,
                        self.range_subject.start,
                        self.range_subject.end,
                        self.e_value,
                        self.score,
                        self.identity ]
        iAlign.setFromTuple( lAttributes )
        return iAlign
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/PathUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/PathUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,858 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import os\n+import sys\n+import copy\n+from commons.core.coord.Path import Path\n+from commons.core.coord.SetUtils import SetUtils\n+from commons.core.coord.Map import Map\n+from commons.core.coord.AlignUtils import AlignUtils\n+from commons.core.checker.RepetException import RepetDataException\n+\n+## Static methods for the manipulation of Path instances\n+#\n+class PathUtils ( object ):\n+ \n+ ## Change the identifier of each Set instance in the given list\n+ #\n+ # @param lPaths list of Path instances\n+ # @param newId new identifier\n+ #\n+ def changeIdInList(lPaths, newId):\n+ for iPath in lPaths:\n+ iPath.id = newId\n+ \n+ changeIdInList = staticmethod( changeIdInList )\n+ \n+ \n+ ## Return a list of Set instances containing the query range from a list of Path instances\n+ # \n+ # @param lPaths a list of Path instances\n+ # \n+ def getSetListFromQueries(lPaths):\n+ lSets = []\n+ for iPath in lPaths:\n+ lSets.append( iPath.getSubjectAsSetOfQuery() )\n+ return lSets\n+ \n+ getSetListFromQueries = staticmethod( getSetListFromQueries )\n+ \n+ #TODO: add tests !!!!\n+ ## Return a list of Set instances containing the query range from a list of Path instances\n+ # \n+ # @param lPaths a list of Path instances\n+ #\n+ @staticmethod\n+ def getSetListFromSubjects(lPaths):\n+ lSets = []\n+ for iPath in lPaths:\n+ lSets.append( iPath.getQuerySetOfSubject() )\n+ return lSets\n+ \n+ \n+ ## Return a sorted list of Range instances containing the subjects from a list of Path instances\n+ # \n+ # @param lPaths a list of Path instances\n+ # @note meaningful only if all Path instances have same identifier\n+ #\n+ def getRangeListFromSubjects( lPaths ):\n+ lRanges = []\n+ for iPath in lPaths:\n+ lRanges.append( iPath.range_subject )\n+ if lRanges[0].isOnDirectStrand():\n+ return sorted( lRanges, key=lambda iRange: ( 
iRange.getMin(), iRange.getMax() ) )\n+ else:\n+ return sorted( lRanges, key=lambda iRange: ( iRange.getMax(), iRange.getMin() ) )\n+ \n+ getRangeListFromSubjects = staticmethod( getRangeListFromSubjects )\n+ \n+ \n+ ## Return a tuple with min and max of query coordinates from Path instances in the given list\n+ #\n+ # @param '..b'te the \'path\' query is supposed to correspond to the \'gff\' first column\n+ #\n+ def convertPathFileIntoGffFile( pathFile, gffFile, source="REPET", verbose=0 ):\n+ dId2PathList = PathUtils.getDictOfListsWithIdAsKeyFromFile( pathFile )\n+ if verbose > 0:\n+ msg = "number of chains: %i" % ( len(dId2PathList.keys()) )\n+ sys.stdout.write( "%s\\n" % msg )\n+ sys.stdout.flush()\n+ gffFileHandler = open( gffFile, "w" )\n+ for id in dId2PathList.keys():\n+ if len( dId2PathList[ id ] ) == 1:\n+ iPath = dId2PathList[ id ][0]\n+ string = iPath.toStringAsGff( ID="%i" % iPath.getIdentifier(),\n+ source=source )\n+ gffFileHandler.write( "%s\\n" % string )\n+ else:\n+ iPathrange = PathUtils.convertPathListToPathrange( dId2PathList[ id ] )\n+ string = iPathrange.toStringAsGff( ID="ms%i" % iPathrange.getIdentifier(),\n+ source=source )\n+ gffFileHandler.write( "%s\\n" % string )\n+ count = 0\n+ for iPath in dId2PathList[ id ]:\n+ count += 1\n+ string = iPath.toStringAsGff( type="match_part",\n+ ID="mp%i-%i" % ( iPath.getIdentifier(), count ),\n+ Parent="ms%i" % iPathrange.getIdentifier(),\n+ source=source )\n+ gffFileHandler.write( "%s\\n" % string )\n+ gffFileHandler.close()\n+ \n+ convertPathFileIntoGffFile = staticmethod( convertPathFileIntoGffFile )\n+ \n+ \n+ ## Convert a Path file into a Set file\n+ # replace old parser.pathrange2set\n+ # @param pathFile: name of the input Path file\n+ # @param setFile: name of the output Set file\n+ #\n+ def convertPathFileIntoSetFile( pathFile, setFile ):\n+ pathFileHandler = open( pathFile, "r" )\n+ setFileHandler = open( setFile, "w" )\n+ iPath = Path()\n+ while True:\n+ line = pathFileHandler.readline()\n+ 
if line == "":\n+ break\n+ iPath.setFromString( line )\n+ iSet = iPath.getSubjectAsSetOfQuery()\n+ iSet.write( setFileHandler )\n+ pathFileHandler.close()\n+ setFileHandler.close()\n+ \n+ convertPathFileIntoSetFile = staticmethod( convertPathFileIntoSetFile )\n+ \n+ ## Write Path File without duplicated Path (same query, same subject and same coordinate)\n+ #\n+ # @param inputFile: name of the input Path file\n+ # @param outputFile: name of the output Path file\n+ #\n+ def removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName(inputFile, outputFile):\n+ f = open(inputFile, "r")\n+ line = f.readline()\n+ previousQuery = ""\n+ previousSubject = ""\n+ lPaths = []\n+ while line:\n+ iPath = Path()\n+ iPath.setFromString(line)\n+ query = iPath.getQueryName()\n+ subject = iPath.getSubjectName()\n+ if (query != previousQuery or subject != previousSubject) and lPaths != []: \n+ lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)\n+ PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")\n+ lPaths = []\n+ lPaths.append(iPath)\n+ previousQuery = query\n+ previousSubject = subject\n+ line = f.readline()\n+ lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)\n+ PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")\n+ f.close()\n+ removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName = staticmethod(removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName)\n+ \n+ \n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/Range.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Range.py Tue Apr 30 15:02:29 2013 -0400 |
b |
b'@@ -0,0 +1,361 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+## Record a region on a given sequence\n+#\n+class Range( object ):\n+\n+ ## Constructor\n+ #\n+ # @param seqname the name of the sequence\n+ # @param start the start coordinate\n+ # @param end the end coordinate\n+ #\n+ def __init__(self, seqname="", start=-1, end=-1):\n+ self.seqname = seqname\n+ self.start = int(start)\n+ self.end = int(end)\n+ \n+ ## Equal operator\n+ #\n+ # @param o a Range instance\n+ #\n+ def __eq__(self, o):\n+ if self.seqname == o.seqname and self.start == o.start and self.end == o.end:\n+ return True\n+ return False\n+ \n+ ## Unequal operator\n+ #\n+ # @param o a Range instance\n+ #\n+ def __ne__(self, o):\n+ return not self.__eq__(o)\n+ \n+ ## Convert the object into a string\n+ #\n+ # @note used in \'print myObject\'\n+ #\n+ def __str__( self ):\n+ return self.toString()\n+ \n+ ## Convert the object into a string\n+ #\n+ # @note used in \'repr(myObject)\' for debugging\n+ #\n+ def __repr__( self ):\n+ return self.toString().replace("\\t",";")\n+ \n+ def setStart(self, start):\n+ self.start = start\n+ \n+ def setEnd(self, end):\n+ self.end = end\n+ \n+ def setSeqName(self, seqName):\n+ self.seqname = seqName\n+ \n+ ## Reset\n+ #\n+ def reset(self):\n+ self.seqname = ""\n+ self.start = -1\n+ self.end = -1\n+ \n+ ## Return the attributes as a formatted string\n+ # \n+ def toString(self):\n+ string = "%s" % (self.seqname)\n+ string += "\\t%d" % (self.start)\n+ string += "\\t%d" % (self.end)\n+ return string\n+ \n+ ## Show the attributes\n+ #\n+ def show(self):\n+ print self.toString()\n+ \n+ ## Return seqname\n+ #\n+ def getSeqname(self):\n+ return self.seqname\n+ \n+ ## Return the start coordinate\n+ #\n+ def getStart(self):\n+ return self.start\n+ \n+ ## Return the end coordinate\n+ #\n+ def getEnd(self):\n+ return self.end\n+ \n+ ## Return the lowest value between start and end 
coordinates\n+ #\n+ def getMin(self):\n+ return min(self.start, self.end)\n+ \n+ ## Return the greatest value between start and end attributes\n+ # \n+ def getMax(self):\n+ return max(self.start, self.end)\n+ \n+ ## Return Tr'..b', o ):\n+ if o.seqname != self.seqname:\n+ return False\n+ if self.getMin() >= o.getMin() and self.getMax() <= o.getMax():\n+ return True\n+ else:\n+ return False\n+\n+ \n+ ## Return the distance between the start of the instance and the start of another Range instance\n+ #\n+ # @param o a Range instance\n+ #\n+ def getDistance(self, o):\n+ if self.isOnDirectStrand() == o.isOnDirectStrand():\n+ if self.isOverlapping(o):\n+ return 0\n+ elif self.isOnDirectStrand():\n+ if self.start > o.start:\n+ return self.start - o.end\n+ else:\n+ return o.start - self.end\n+ else:\n+ if self.start > o.start:\n+ return self.end - o.start\n+ else:\n+ return o.end - self.start\n+ return -1\n+ \n+ ## Remove in the instance the region overlapping with another Range instance\n+ #\n+ # @param o a Range instance\n+ # \n+ def diff(self, o):\n+ new_range = Range(self.seqname)\n+ if not self.isOverlapping(o) or self.seqname != o.seqname:\n+ return new_range\n+\n+ istart = min(self.start, self.end)\n+ iend = max(self.start, self.end)\n+ jstart = min(o.start, o.end)\n+ jend = max(o.start, o.end)\n+ if istart < jstart:\n+ if iend <= jend:\n+ if self.isOnDirectStrand():\n+ self.start = istart\n+ self.end = jstart - 1\n+ else:\n+ self.start = jstart - 1\n+ self.end = istart\n+ else:\n+ if self.isOnDirectStrand():\n+ self.start = istart\n+ self.end = jstart - 1\n+ new_range.start = jend + 1\n+ new_range.end = iend\n+ else:\n+ self.start = jstart - 1;\n+ self.end = istart;\n+ new_range.start = iend\n+ new_range.end = jend + 1\n+ else: #istart>=jstart\n+ if iend <= jend:\n+ self.start = 0\n+ self.end = 0\n+ else:\n+ if self.isOnDirectStrand():\n+ self.start = jend + 1\n+ self.end = iend\n+ else:\n+ self.start = iend\n+ self.end = jend + 1\n+ return new_range\n+ \n+ ## Find 
the bin that contains the instance and compute its index\n+ #\n+ # @note Required for coordinate indexing via a hierarchical bin system\n+ #\n+ def findIdx(self):\n+ min_lvl = 3\n+ max_lvl = 6\n+ for bin_lvl in xrange(min_lvl, max_lvl):\n+ if getBin(self.start, bin_lvl) == getBin(self.end, bin_lvl):\n+ return getIdx(self.start, bin_lvl)\n+ return getIdx(self.start, max_lvl) \n+ \n+ ## Get a bin for fast database access\n+ #\n+ # @return bin number (float)\n+ #\n+ def getBin(self):\n+ for i in xrange(3, 8):\n+ bin_lvl = pow(10, i)\n+ if int(self.start/bin_lvl) == int(self.end/bin_lvl):\n+ return float(bin_lvl+(int(self.start/bin_lvl)/1e10))\n+ bin_lvl = pow(10, 8)\n+ return float(bin_lvl+(int(self.start/bin_lvl)/1e10))\n+ \n+ \n+# Functions\n+\n+# Get the bin number of a coordinate according to the bin level. Required for coordinate indexing with hierarchical bin system\n+# \n+def getBin(val, bin_lvl):\n+ bin_size = pow(10, bin_lvl)\n+ return long(val / bin_size)\n+ \n+# Get an index from a coordinate according to the bin level. Required for coordinate indexing with hierarchical bin system\n+#\n+def getIdx(val, bin_lvl):\n+ min_lvl = 3\n+ max_lvl = 6\n+ if bin_lvl >= max_lvl:\n+ return long((bin_lvl-min_lvl+1)*pow(10,max_lvl))\n+ return long(((bin_lvl-min_lvl+1)*pow(10,max_lvl))+getBin(val,bin_lvl))\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/Set.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/Set.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,125 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.coord.Map import Map


## Record a named region on a given sequence with an identifier
#
class Set( Map ):

    ## Constructor
    #
    # @param id identifier
    # @param name the name of the region
    # @param seqname the name of the sequence
    # @param start the start coordinate
    # @param end the end coordinate
    #
    def __init__(self, id=-1, name="", seqname="", start=-1, end=-1):
        Map.__init__( self, name, seqname, start, end )
        self.id = id

    ## Equal operator
    #
    # @param o a Set instance (or None)
    # @return False if o is None or has another identifier, otherwise the result of Map.__eq__
    #
    def __eq__(self, o):
        # comparing with None must answer False rather than fail on 'o.id'
        if o is None or self.id != o.id:
            return False
        return Map.__eq__(self, o)

    ## Return the identifier
    #
    def getId(self):
        return self.id

    ## Reset
    #
    def reset(self):
        self.setFromTuple([-1, "", "", -1, -1 ])

    ## Set attributes from tuple
    #
    # @param tuple: a tuple with (id, name, seqname, start, end)
    #
    def setFromTuple(self, tuple):
        self.id = int(tuple[0])
        # the remaining fields are handled by the parent class
        Map.setFromTuple(self, tuple[1:])

    ## Return the attributes as a formatted string
    #
    # @return the identifier followed by a tabulation and the Map string
    #
    def toString(self):
        string = "%i" % (self.id)
        string += "\t%s" % (Map.toString(self))
        return string

    ## Merge the instance with another Set instance
    #
    # @param o a Set instance
    # @note nothing is done if both instances are not on the same sequence;
    #       otherwise the smallest identifier is kept
    #
    def merge(self, o):
        if self.seqname == o.seqname:
            Map.merge(self, o)
            self.id = min(self.id, o.id)

    ## Return a Map instance with the attributes
    #
    def getMap(self):
        return Map(self.name, self.seqname, self.start, self.end)

    ## Remove in the instance the region overlapping with another Set instance
    #
    # @param o a Set instance
    # @return a Set instance built from the Map returned by Map.diff (same identifier, name and
    #         sequence name as the instance), or a default Set if that Map is empty
    #
    def diff(self, o):
        iMap = Map.diff(self, o.getMap())
        new = Set()
        if not iMap.isEmpty():
            new.id = self.id
            new.name = self.name
            new.seqname = self.seqname
            new.start = iMap.start
            new.end = iMap.end
        return new

    ## Return a Map instance with the identifier in the name
    #
    # @note the Map name is built as "name::id"
    #
    def set2map(self):
        return Map(self.name+"::"+str(self.id),self.seqname,self.start,self.end)

    ## Return a Map instance with the same attributes, except the identifier
    #
    # @note unlike getMap(), the attributes are loaded via Map.setFromTuple
    #
    def getMapInstance( self ):
        iMap = Map()
        lAttributes = [ self.name, self.seqname, self.start, self.end ]
        iMap.setFromTuple( lAttributes )
        return iMap
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/SetUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/SetUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,553 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+from commons.core.coord.Set import Set\n+\n+## Static methods for the manipulation of Path instances\n+#\n+class SetUtils( object ):\n+ \n+ ## Change the identifier of each Set instance in the given list\n+ #\n+ # @param lSets list of Set instances\n+ # @param newId new identifier\n+ #\n+ def changeIdInList(lSets, newId):\n+ for iSet in lSets:\n+ iSet.id = newId\n+ \n+ changeIdInList = staticmethod( changeIdInList )\n+ \n+ ## Return the length of the overlap between two lists of Set instances\n+ #\n+ # @param lSets1 list of Set instances\n+ # @param lSets2 list of Set instances\n+ # @return length of overlap\n+ # @warning sequence names are supposed to be identical\n+ #\n+ def getOverlapLengthBetweenLists(lSets1, lSets2):\n+ lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1)\n+ lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2)\n+ osize = 0\n+ i = 0\n+ j = 0\n+ while i!= len(lSet1Sorted):\n+ while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\\\n+ and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])):\n+ j+=1\n+ jj=j\n+ while jj!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[jj]):\n+ osize+=lSet1Sorted[i].getOverlapLength(lSet2Sorted[jj])\n+ jj+=1\n+ i+=1\n+ return osize\n+ \n+ getOverlapLengthBetweenLists = staticmethod( getOverlapLengthBetweenLists )\n+ \n+ ## Return True if the two lists of Set instances overlap, False otherwise \n+ #\n+ # @param lSets1 list of Set instances\n+ # @param lSets2 list of Set instances\n+ # \n+ def areSetsOverlappingBetweenLists( lSets1, lSets2 ):\n+ lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1)\n+ lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2)\n+ i=0\n+ j=0\n+ while i!= len(lSet1Sorted):\n+ while j!= len(lSet2Sorted) and 
lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\\\n+ and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])):\n+ j+=1\n+ if j!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[j]):\n+ return True\n+ i+=1\n+ return False\n+ '..b' def getListOfSetWithoutOverlappingBetweenTwoListOfSet(lSet1, lSet2):\n+ for i in lSet1:\n+ for idx,j in enumerate(lSet2):\n+ n=j.diff(i)\n+ if not n.isEmpty() and n.getLength()>=20:\n+ lSet2.append(n)\n+ lSet2WithoutOverlaps=[]\n+ for i in lSet2:\n+ if not i.isEmpty() and i.getLength()>=20:\n+ lSet2WithoutOverlaps.append(i)\n+ return lSet2WithoutOverlaps\n+ \n+ getListOfSetWithoutOverlappingBetweenTwoListOfSet = staticmethod (getListOfSetWithoutOverlappingBetweenTwoListOfSet)\n+\n+ ## Return a Set list from a Set file\n+ #\n+ # @param setFile string name of a Set file\n+ # @return a list of Set instances\n+ #\n+ def getSetListFromFile( setFile ):\n+ lSets = []\n+ setFileHandler = open( setFile, "r" )\n+ while True:\n+ line = setFileHandler.readline()\n+ if line == "":\n+ break\n+ iSet = Set()\n+ iSet.setFromString( line )\n+ lSets.append( iSet )\n+ setFileHandler.close()\n+ return lSets\n+ \n+ getSetListFromFile = staticmethod( getSetListFromFile )\n+ \n+ \n+ def convertSetFileIntoMapFile( setFile, mapFile ):\n+ setFileHandler = open( setFile, "r" )\n+ mapFileHandler = open( mapFile, "w" )\n+ iSet = Set()\n+ while True:\n+ line = setFileHandler.readline()\n+ if line == "":\n+ break\n+ iSet.setFromString( line )\n+ iMap = iSet.getMapInstance()\n+ iMap.write( mapFileHandler )\n+ setFileHandler.close()\n+ mapFileHandler.close()\n+ \n+ convertSetFileIntoMapFile = staticmethod( convertSetFileIntoMapFile )\n+\n+\n+ def getDictOfListsWithSeqnameAsKey( lSets ):\n+ dSeqnamesToSetList = {}\n+ for iSet in lSets:\n+ if not dSeqnamesToSetList.has_key( iSet.seqname ):\n+ dSeqnamesToSetList[ iSet.seqname ] = []\n+ dSeqnamesToSetList[ iSet.seqname ].append( iSet )\n+ return dSeqnamesToSetList\n+ \n+ getDictOfListsWithSeqnameAsKey = 
staticmethod( getDictOfListsWithSeqnameAsKey )\n+ \n+ \n+ def filterOnLength( lSets, minLength=0, maxLength=10000000000 ):\n+ if minLength == 0 and maxLength == 0:\n+ return lSets\n+ lFiltered = []\n+ for iSet in lSets:\n+ if minLength <= iSet.getLength() <= maxLength:\n+ lFiltered.append( iSet )\n+ return lFiltered\n+ \n+ filterOnLength = staticmethod( filterOnLength )\n+ \n+ \n+ def getListOfNames( setFile ):\n+ lNames = []\n+ setFileHandler = open( setFile, "r" )\n+ iSet = Set()\n+ while True:\n+ line = setFileHandler.readline()\n+ if line == "":\n+ break\n+ iSet.setFromTuple( line[:-1].split("\\t") )\n+ if iSet.name not in lNames:\n+ lNames.append( iSet.name )\n+ setFileHandler.close()\n+ return lNames\n+ \n+ getListOfNames = staticmethod( getListOfNames )\n+\n+\n+ def getDictOfDictsWithNamesThenIdAsKeyFromFile( setFile ):\n+ dNames2DictsId = {}\n+ setFileHandler = open( setFile, "r" )\n+ while True:\n+ line = setFileHandler.readline()\n+ if line == "":\n+ break\n+ iSet = Set()\n+ iSet.setFromTuple( line[:-1].split("\\t") )\n+ if not dNames2DictsId.has_key( iSet.name ):\n+ dNames2DictsId[ iSet.name ] = { iSet.id: [ iSet ] }\n+ else:\n+ if not dNames2DictsId[ iSet.name ].has_key( iSet.id ):\n+ dNames2DictsId[ iSet.name ][ iSet.id ] = [ iSet ]\n+ else:\n+ dNames2DictsId[ iSet.name ][ iSet.id ].append( iSet )\n+ setFileHandler.close()\n+ return dNames2DictsId\n+ \n+ getDictOfDictsWithNamesThenIdAsKeyFromFile = staticmethod( getDictOfDictsWithNamesThenIdAsKeyFromFile )\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/SlidingWindow.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/SlidingWindow.py Tue Apr 30 15:02:29 2013 -0400 |
b |
class SlidingWindow(object):
    """Window defined by a start and an end coordinate that slides by a fixed step.

    The window starts at coordinate 1 and ends at ``length``. Each call to
    :meth:`slideWindowOnce` shifts both bounds by ``length - overlap``.
    """

    def __init__( self, length = 1, overlap = 1 ):
        """
        @param length:  nominal length of the window
        @param overlap: number of positions shared by two consecutive windows
        """
        self._length = length
        self._overlap = overlap
        self._start = 1
        self._end = length
        self._step = length - overlap

    def _updateStep(self):
        """Recompute the sliding step from the current length and overlap."""
        self._step = self._length - self._overlap

    def slideWindowOnce(self):
        """Shift both bounds of the window by one step."""
        self._start = self._start + self._step
        self._end = self._end + self._step

    def getStart(self):
        return self._start

    def getEnd(self):
        return self._end

    def setStart(self, start):
        self._start = start

    def setEnd(self, end):
        self._end = end

    def getLength(self):
        return self._length

    def getOverlap(self):
        return self._overlap

    def setLength(self, length):
        """Change the nominal length.

        The step is refreshed so that the next slide uses the new length.
        The current bounds are left untouched; use setStart()/setEnd() to
        move them.
        """
        self._length = length
        self._updateStep()

    def setOverlap(self, overlap):
        """Change the overlap and refresh the step used by the next slide."""
        self._overlap = overlap
        self._updateStep()

    def getSlidingMsg(self):
        return "Window is sliding : %s %s" %(self._start, self._end)


class SlidingWindowToCountMatchingBases(SlidingWindow):
    """Sliding window able to report which part of a feature lies on it.

    The features given to the public methods must provide getMin(), getMax(),
    getStart(), getEnd() and getLength().
    """

    def getSetLengthOnWindow( self, iSet ):
        """Return the number of positions of ``iSet`` located on the window.

        The cases are tested in this order: feature inside the window, window
        inside the feature, feature beginning inside the window, feature
        ending inside the window.

        @return: the length, or None when none of the cases applies (the
                 feature does not touch the window)
        """
        if self._isSetIncludedInTheWindow(iSet):
            return iSet.getLength()
        if self._isWindowIncludedInTheSet(iSet):
            return self._length
        if self._isSetOverlapTheRightSideOfTheWindow(iSet):
            return self._end - iSet.getMin()+1
        if self._isSetOverlapTheLeftSideOfTheWindow(iSet):
            return iSet.getMax() - self._start+1
        return None

    def getCoordSetOnWindow( self, iSet ):
        """Return the (start, end) couple of the part of ``iSet`` on the window.

        The cases are tested in the same order as in getSetLengthOnWindow().

        @return: a (start, end) tuple, or None when the feature does not
                 touch the window
        """
        if self._isSetIncludedInTheWindow(iSet):
            return iSet.getStart(), iSet.getEnd()
        if self._isWindowIncludedInTheSet(iSet):
            return self.getStart(), self.getEnd()
        if self._isSetOverlapTheRightSideOfTheWindow(iSet):
            return iSet.getStart(), self.getEnd()
        if self._isSetOverlapTheLeftSideOfTheWindow(iSet):
            return self.getStart(), iSet.getEnd()
        return None

    def _isSetIncludedInTheWindow(self, feature):
        """True when both extremities of the feature are within the window."""
        return feature.getMin() >= self._start and feature.getMax() <= self._end

    def _isWindowIncludedInTheSet(self, feature):
        """True when the feature covers the whole window."""
        return self._start >= feature.getMin() and self._end <= feature.getMax()

    def _isSetOverlapTheRightSideOfTheWindow(self, feature):
        """True when the lowest coordinate of the feature is within the window."""
        return feature.getMin() <= self._end and feature.getMin() >= self._start

    def _isSetOverlapTheLeftSideOfTheWindow(self, feature):
        """True when the highest coordinate of the feature is within the window."""
        return feature.getMax() <= self._end and feature.getMax() >= self._start
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/coord/align2set.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/coord/align2set.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#!/usr/bin/env python

import sys
import getopt
from commons.core.coord.Align import Align


def help():
    """Print the command-line usage on the standard output."""
    print("usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ))
    print("options:")
    print(" -h: this help")
    print(" -i: input file name (format='align')")
    print(" -o: output file name (format='set', default=inFileName+'.set')")
    print(" -v: verbosity level (default=0/1)")


def align2set( inFileName, outFileName ):
    """Convert a file in the 'align' format into a file in the 'set' format.

    Each input line is loaded into an Align instance (tab-separated fields)
    and written as one output line made of: the rank of the line in the input
    file (starting at 1), the subject name, the query name, the query start
    and the query end.

    @param inFileName:  name of the input file (format='align')
    @param outFileName: name of the output file (format='set'), overwritten
    """
    # 'with' guarantees that both handles are closed even if a line cannot
    # be parsed or written
    with open( inFileName, "r" ) as alignFileHandler:
        with open( outFileName, "w" ) as setFileHandler:
            iAlign = Align()
            countAlign = 0
            for line in alignFileHandler:
                countAlign += 1
                iAlign.setFromString( line, "\t" )
                setFileHandler.write( "%i\t%s\t%s\t%i\t%i\n" % ( countAlign,
                                                                 iAlign.getSubjectName(),
                                                                 iAlign.getQueryName(),
                                                                 iAlign.getQueryStart(),
                                                                 iAlign.getQueryEnd() ) )


def main():
    """Parse the command line, run the conversion and return 0.

    Exits with status 1 on an invalid option or a missing input file name,
    and with status 0 after printing the help when '-h' is given.
    """
    inFileName = ""
    outFileName = ""
    verbose = 0

    try:
        opts, args = getopt.getopt( sys.argv[1:], "hi:o:v:" )
    except getopt.GetoptError as err:
        print(str(err))
        help()
        sys.exit(1)
    for o,a in opts:
        if o == "-h":
            help()
            sys.exit(0)
        elif o == "-i":
            inFileName = a
        elif o == "-o":
            outFileName = a
        elif o == "-v":
            verbose = int(a)

    if inFileName == "":
        print("ERROR: missing input file name")
        help()
        sys.exit(1)

    if verbose > 0:
        print("START %s" % ( sys.argv[0].split("/")[-1] ))
        sys.stdout.flush()

    # default output name: input name with the '.set' suffix appended
    if outFileName == "":
        outFileName = "%s.set" % ( inFileName )

#TODO: move 'align2set' into 'AlignUtils.convertAlignFileIntoPSetFile' with a test
#    AlignUtils.convertAlignFileIntoPSetFile( inFileName, outFileName )

    align2set( inFileName, outFileName )

    if verbose > 0:
        print("END %s" % ( sys.argv[0].split("/")[-1] ))
        sys.stdout.flush()

    return 0


if __name__ == "__main__":
    main()
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/.BamParser.py.swp |
b |
Binary file commons/core/parsing/.BamParser.py.swp has changed |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/AxtParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/AxtParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,140 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.SubMapping import SubMapping
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.misc import Utils
from SMART.Java.Python.misc.Utils import getHammingDistance


class AxtParser(MapperParser):
    """A class that parses AXT (as given by Mosaik)"""

    def __init__(self, fileName, verbosity = 0):
        super(AxtParser, self).__init__(fileName, verbosity)
        # sequence lines of the record being read; an AXT record is made of
        # one header line followed by two sequence lines
        self.queryLine = None
        self.subjectLine = None

    def __del__(self):
        super(AxtParser, self).__del__()


    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["axt"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        """No header to skip in this format."""
        pass


    def getInfos(self):
        """Read the whole file once to collect global statistics.

        Blank lines are ignored. Every third remaining line (the first one of
        each record) is split on single spaces; its second field is stored in
        'self.chromosomes', its seventh field is added to 'self.size' and
        'self.nbMappings' is incremented. The parser is reset before and
        after the scan.
        """
        self.chromosomes = set()
        self.nbMappings = 0
        self.size = 0
        cpt = 0
        self.reset()
        for line in self.handle:
            line = line.strip()
            if line == "":
                continue
            if cpt % 3 == 0:
                parts = line.split(" ")
                self.chromosomes.add(parts[1])
                # NOTE(review): the 7th field is accumulated as a size;
                # confirm that this is the intended column
                self.size += int(parts[6])
                self.nbMappings += 1
            cpt += 1
            if self.verbosity >= 10 and self.nbMappings % 100000 == 0:
                sys.stdout.write(" %d mappings read\r" % (self.nbMappings))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            print(" %d mappings read" % (self.nbMappings))
            print("Done.")


    def parseLine(self, line):
        """Consume one line of an AXT record.

        A record is completed over three calls: the header line creates
        'self.currentMapping', the next line is kept as the query sequence
        and the following one as the subject sequence.

        @param line: the current line of the file
        @return: None while the record is incomplete (or on a blank line),
                 the mapping once the second sequence line has been read
        """
        # a blank line separates two records: move on to the next line
        if line.strip() == "":
            for line in self.handle:
                self.currentLineNb += 1
                break
        if line.strip() == "":
            return None

        m = re.search(r"^\s*\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+\d+\s*$", line)
        if m is not None:
            chromosome = m.group(1)
            queryName = m.group(4)
            strand = m.group(7)
            # coordinates are stored with start <= end whatever their order in the file
            targetStart = min(int(m.group(2)), int(m.group(3)))
            targetEnd = max(int(m.group(2)), int(m.group(3)))
            queryStart = min(int(m.group(5)), int(m.group(6)))
            queryEnd = max(int(m.group(5)), int(m.group(6)))

            mapping = Mapping()
            subMapping = SubMapping()

            subMapping.queryInterval.setName(queryName)
            subMapping.queryInterval.setStart(queryStart)
            subMapping.queryInterval.setEnd(queryEnd)
            subMapping.queryInterval.setDirection(strand)

            subMapping.targetInterval.setChromosome(chromosome)
            subMapping.targetInterval.setStart(targetStart)
            subMapping.targetInterval.setEnd(targetEnd)
            subMapping.targetInterval.setDirection(1)

            subMapping.setSize(min(subMapping.targetInterval.getSize(), subMapping.queryInterval.getSize()))
            subMapping.setDirection(strand)

            mapping.addSubMapping(subMapping)

            mapping.setDirection(strand)
            mapping.targetInterval.setChromosome(chromosome)
            mapping.targetInterval.setStart(targetStart)
            mapping.targetInterval.setEnd(targetEnd)

            mapping.queryInterval.setName(queryName)
            mapping.queryInterval.setStart(queryStart)
            mapping.queryInterval.setEnd(queryEnd)

            mapping.setSize(min(mapping.targetInterval.getSize(), mapping.queryInterval.getSize()))

            self.currentMapping = mapping
            return None

        # first sequence line of the record: keep it and wait for the second one
        if self.queryLine is None:
            self.queryLine = line
            return None

        # second sequence line: the record is complete
        self.subjectLine = line
        self.currentMapping.setNbMismatches(getHammingDistance(self.queryLine, self.subjectLine))
        self.currentMapping.setNbGaps(0)
        self.queryLine = None
        self.subjectLine = None
        return self.currentMapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/BamParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BamParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,483 @@\n+#\n+# Copyright INRA-URGI 2009-2012\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import re, sys, gzip, struct\n+from commons.core.parsing.MapperParser import MapperParser\n+from SMART.Java.Python.structure.Mapping import Mapping\n+from SMART.Java.Python.structure.SubMapping import SubMapping\n+from SMART.Java.Python.structure.Interval import Interval\n+\n+\n+BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"\n+\n+BAM_CIGAR_LOOKUP = "MIDNSHP=X"\n+BAM_CIGAR_SHIFT = 4\n+BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1)\n+\n+\n+\n+def pack_int32(x):\n+\treturn struct.pack(\'<i\', x)\n+\n+def pack_uint32(x):\n+\treturn struct.pack(\'<I\', x)\n+\n+def unpack_int8(x):\n+\treturn struct.unpack(\'<b\', x)[0]\n+\n+def unpack_int16(x):\n+\treturn struct.unpack(\'<h\', x)[0]\n+\n+def unpack_int32(x):\n+\treturn struct.unpack(\'<i\', x)[0]\n+\n+def unpack_int64(x):\n+\treturn struct.unpack(\'<q\', x)[0]\n+\n+def unpack_uint8(x):\n+\treturn struct.unpack(\'<B\', x)[0]\n+\n+def unpack_uint16(x):\n+\treturn struct.unpack(\'<H\', x)[0]\n+\n+def unpack_uint32(x):\n+\treturn struct.unpack(\'<I\', x)[0]\n+\n+def unpack_uint64(x):\n+\treturn struct.unpack(\'<Q\', x)[0]\n+\n+def unpack_float(x):\n+\treturn struct.unpack(\'<f\', x)[0]\n+\n+def unpack_string(x):\n+\tlength = len(x)\n+\tformat_string = "<{0}s".format(length)\n+\tstring = struct.unpack(format_string, x)[0]\n+\tif string[-1] == \'\\0\':\n+\t\treturn string[:-1]\n+\telse:\n+\t\treturn string\n+\n+\n+BAM_TAG_CODE = {"c": unpack_int8, \\\n+\t\t\t\t"C": unpack_uint8, \\\n+\t\t\t\t"s": unpack_int16, \\\n+\t\t\t\t"S": unpack_uint16, \\\n+\t\t\t\t"i": unpack_int32, \\\n+\t\t\t\t"I": unpack_uint32, 
\\\n+\t\t\t\t"f": unpack_float, \\\n+\t\t\t\t#"A": unpack_int8, \\\n+\t\t\t\t"A": lambda x: x, \\\n+\t\t\t\t"Z": unpack_int8, \\\n+\t\t\t\t"H": unpack_int8}\n+\n+BAM_TAG_VALUE = {"c": int, \\\n+\t\t\t\t "C": int, \\\n+\t\t\t\t "s": int, \\\n+\t\t\t\t "S": int, \\\n+\t\t\t\t "i": int, \\\n+\t\t\t\t "I": int, \\\n+\t\t\t\t "f": float, \\\n+\t\t\t\t "A": lambda x: x}\n+\n+BAM_TAG_SIZE = {"c": 1, \\\n+\t\t\t\t"C": 1, \\\n+\t\t\t\t"s": 2, \\\n+\t\t\t\t"S": 2, \\\n+\t\t\t\t"i": 4, \\\n+\t\t\t\t"I": 4, \\\n+\t\t\t\t"f": 4, \\\n+\t\t\t\t"A": 1}\n+\n+\n+class CigarOp(object):\n+\tdef __init__(self, data):\n+\t\tself._length = data >> BAM_CIGAR_SHIFT\n+\t\tself._type = BAM_CIGAR_LOOKUP[ data & BAM_CIGAR_MASK ]\n+\n+\n+class CigarData(object):\n+\tdef __init__(self, data, num_ops):\n+\t\tself._ops = []\n+\t\tfor i in range(num_ops):\n+\t\t\tcigar_data = unpack_uint32(data[i*4: (i+1)*4])\n+\t\t\tself._ops.append(CigarOp(cigar_data))\t\t\n+\n+\tdef getCigarData(self):\n+\t\treturn self._ops\n+\t\n+\tdef __str__(self):\n+\t\treturn "".join(["%d%s" % (op._length, op._type) for op in self._ops])\n+\n+\n+class TagsData(object):\n+\tdef __init__(self):\n+\t\tself._tags = {}\n+\n+\tdef add(self, tag):\n+\t\tself._tags[tag._ta'..b'nbGaps\t\t = 0\n+\tsubMapping\t = None\n+\tqueryOffset = 0\n+\ttargetOffset = 0\n+\treadStart\t = None\n+\n+\tfor tag, value in read._tags.iteritems():\n+\t\tif tag == "X0":\n+\t\t\tnbOccurrences = value._value\n+\t\telif tag == "X1":\n+\t\t\tnbOccurrences += value._value\n+\t\telif tag == "XM":\n+\t\t\tnbMismatches = value._value\n+\tmapping.setTagValue("nbOccurrences", nbOccurrences)\n+\tmapping.setTagValue("quality", read._mappingQuality)\n+\n+\tfor operation in read._cigar:\n+\t\tif operation._type == "M":\n+\t\t\tif readStart == None:\n+\t\t\t\treadStart = queryOffset\n+\t\t\tif subMapping == None:\n+\t\t\t\tsubMapping = 
SubMapping()\n+\t\t\t\tsubMapping.setSize(operation._length)\n+\t\t\t\tsubMapping.setDirection(direction)\n+\t\t\t\tsubMapping.queryInterval.setName(read._name)\n+\t\t\t\tsubMapping.queryInterval.setStart(queryOffset)\n+\t\t\t\tsubMapping.queryInterval.setDirection(direction)\n+\t\t\t\tsubMapping.targetInterval.setChromosome(read._chromosome)\n+\t\t\t\tsubMapping.targetInterval.setStart(genomeStart + targetOffset)\n+\t\t\t\tsubMapping.targetInterval.setDirection(1)\n+\t\t\tnbMatches\t += operation._length\n+\t\t\ttargetOffset += operation._length\n+\t\t\tqueryOffset += operation._length\n+\t\t\tcurrentNumber = 0\n+\t\t\tcontinue\n+\t\tif operation._type == "I":\n+\t\t\tnbGaps\t += 1\n+\t\t\tqueryOffset += operation._length\n+\t\t\tcurrentNumber = 0\n+\t\t\tcontinue\n+\t\tif operation._type == "D":\n+\t\t\tif subMapping != None:\n+\t\t\t\tsubMapping.queryInterval.setEnd(queryOffset - 1)\n+\t\t\t\tsubMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)\n+\t\t\t\tmapping.addSubMapping(subMapping)\n+\t\t\tsubMapping\t = None\n+\t\t\tnbGaps\t += 1\n+\t\t\ttargetOffset += operation._length\n+\t\t\tcurrentNumber = 0\n+\t\t\tcontinue\n+\t\tif operation._type == "N":\n+\t\t\tif subMapping != None:\n+\t\t\t\tsubMapping.queryInterval.setEnd(queryOffset - 1)\n+\t\t\t\tsubMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)\n+\t\t\t\tmapping.addSubMapping(subMapping)\n+\t\t\tsubMapping\t= None\n+\t\t\ttargetOffset += operation._length\n+\t\t\tcurrentNumber = 0\n+\t\t\tcontinue\n+\t\tif operation._type == "S":\n+\t\t\tnbMismatches += operation._length\n+\t\t\ttargetOffset += operation._length\n+\t\t\tqueryOffset += operation._length\n+\t\t\tcurrentNumber = 0\n+\t\t\tcontinue\n+\t\tif operation._type == "H":\n+\t\t\ttargetOffset += operation._length\n+\t\t\tqueryOffset += operation._length\n+\t\t\tcurrentNumber = 0\n+\t\t\tcontinue\n+\t\tif operation._type == "P":\n+\t\t\tcontinue\n+\t\traise Exception("Do not understand parameter \'%s\'" % 
(operation._type))\n+\n+\tif subMapping != None:\n+\t\tsubMapping.queryInterval.setEnd(queryOffset - 1)\n+\t\tsubMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)\n+\t\tmapping.addSubMapping(subMapping)\n+\tmapping.queryInterval.setStart(readStart)\n+\tmapping.queryInterval.setEnd(queryOffset - 1)\n+\tmapping.targetInterval.setEnd(genomeStart + targetOffset - 1)\n+\tmapping.setNbMismatches(nbMismatches)\n+\tmapping.setNbGaps(nbGaps)\n+\tmapping.queryInterval.setName(read._name)\n+\tmapping.queryInterval.setDirection(direction)\n+\tmapping.targetInterval.setChromosome(read._chromosome)\n+\tmapping.targetInterval.setStart(genomeStart)\n+\tmapping.targetInterval.setDirection(direction)\n+\tmapping.setSize(len(read._sequence))\n+\tmapping.setDirection(direction)\n+\treturn mapping\n+\n+\t\n+class BamParser(MapperParser):\n+\t"""A class that parses BAM format"""\n+\n+\tdef __init__(self, fileName, verbosity = 0):\n+\t\tself.verbosity = verbosity\n+\t\tself.handle = gzip.open(fileName, "rb")\n+\t\tself.reader = FileReader(self.handle)\n+\t\tself.nbMappings = None\n+\t\tself.fileName = fileName\n+\n+\n+\tdef __del__(self):\n+\t\tself.handle.close()\n+\n+\n+\tdef getFileFormats():\n+\t\treturn ["bam"]\n+\tgetFileFormats = staticmethod(getFileFormats)\n+\n+\n+\tdef reset(self):\n+\t\tself.reader.reset()\n+\n+\n+\tdef getNextMapping(self):\n+\t\tself.currentMapping = None\n+\t\twhile self.currentMapping == None:\n+\t\t\tread = self.reader.getNextAlignment()\n+\t\t\tif not read:\n+\t\t\t\tself.currentMapping = False\n+\t\t\t\treturn False\n+\t\t\tread.parse()\n+\t\t\tself.currentMapping = parseAlignedRead(read)\n+\t\treturn self.currentMapping\n+\t\t\n+\t\t\n+\tdef setDefaultTagValue(self, name, value):\n+\t\tpass\n+\n+\n+\tdef skipFirstLines(self):\n+\t\tpass\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/BedParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BedParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,139 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Interval import Interval
from commons.core.parsing.TranscriptListParser import TranscriptListParser
from SMART.Java.Python.structure.Transcript import Transcript


class BedParser(TranscriptListParser):
    """A class that parses a BED file and create a transcript list"""


    def __init__(self, fileName, verbosity = 0):
        # name read from the optional 'track' line, None when there is none
        self.title = None
        TranscriptListParser.__init__(self, fileName, verbosity)


#    def __del__(self):
#        super(BedParser, self).__del__()


    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["bed"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        """Consume the first line when it is a 'track name=...' line.

        The name is stored in 'self.title'. Otherwise the handle is moved
        back to where it was so that the line is parsed as a regular record.
        """
        mark = self.handle.tell()
        line = self.handle.readline()
        line = line.strip()
        # the line has been stripped: the name may be the last token, so the
        # pattern must also accept the end of the line right after it
        m = re.search(r"^\s*track\s+name\s*=\s*(\S+)(?:\s+|$)", line)
        if m is not None:
            self.title = m.group(1)
            self.currentLineNb += 1
        else:
            self.handle.seek(mark)
        return


    def _buildBedTranscript(self, m, name, direction):
        """Create a transcript from the three first columns matched in 'm'.

        @param m:         match object whose groups 1, 2 and 3 are the
                          chromosome, the start and the end columns
        @param name:      name given to the transcript
        @param direction: value passed to Transcript.setDirection()
        """
        first = int(m.group(2))
        # the end column is decremented by one before being used
        second = int(m.group(3)) - 1
        transcript = Transcript()
        transcript.setChromosome(m.group(1))
        transcript.setStart(min(first, second))
        transcript.setEnd(max(first, second))
        transcript.setName(name)
        transcript.setDirection(direction)
        return transcript


    def parseLine(self, line):
        """Build a transcript from one line of the file.

        Lines with 3 columns (no name), 4 columns (name), 5 columns (name and
        score) and 12 columns (exon blocks) are accepted.

        @param line: the current line of the file
        @return: a Transcript instance
        @raise Exception: when the line matches none of the layouts
        """
        # chromosome, start, end
        m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\s*$", line)
        if m is not None:
            return self._buildBedTranscript(m, "Unnamed", 1)

        # chromosome, start, end, name
        m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\s*$", line)
        if m is not None:
            return self._buildBedTranscript(m, m.group(4), 1)

        # chromosome, start, end, name, score
        m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+\d+\.?\d*\s*$", line)
        if m is not None:
            return self._buildBedTranscript(m, m.group(4), 1)

        # full layout with strand and exon blocks
        m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+\d+\t+([+-])\t+\d+\t+\d+\t+0\t+(\d+)\t+(\S+)\t+(\S+)\s*$", line)
        if m is None:
            raise Exception("\nLine %d '%s' does not has a BED format." % (self.currentLineNb, line))
        transcript = self._buildBedTranscript(m, m.group(4), m.group(5))
        nbExons = int(m.group(6))
        sizes = m.group(7).split(",")
        starts = m.group(8).split(",")

        # check for comment in name
        m = re.search(r"^([^\(]*)\((\S+)\)$", transcript.getName())
        if m is not None:
            transcript.setName(m.group(1))
            transcript.setTagValues(m.group(2), ";", "=")

        # check for nb occurrences in name
        m = re.search(r"(.*)-(\d+)$", transcript.getName())
        if m is not None:
            transcript.setName(m.group(1))
            transcript.setOccurrence(int(m.group(2)))

        # block starts are relative to the start of the transcript
        for i in range(nbExons):
            exon = Interval(transcript)
            exon.setStart(int(starts[i])+transcript.getStart())
            exon.setEnd(transcript.getStart()+int(starts[i])+int(sizes[i])-1)
            exon.setSize(int(sizes[i]))
            transcript.addExon(exon)

        # the exon blocks must span exactly the transcript
        if transcript.exons[0].getStart() != transcript.getStart():
            sys.exit("There is something wrong with the start of transcript line '%s': transcript starts at %d whereas first exon starts at %d" % (line.strip(), transcript.start, transcript.exons[0].start))
        if transcript.exons[-1].getEnd() != transcript.getEnd():
            sys.exit("There is something wrong with the end of transcript line '%s': transcript ends at %d whereas last exon ends at %d" % (line.strip(), transcript.end, transcript.exons[-1].end))

        return transcript
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/BlastParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BlastParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,88 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.structure.SubMapping import SubMapping
from SMART.Java.Python.structure.Mapping import Mapping


class BlastParser(MapperParser):
    """Parser for the tabular output of Blast (-m 8 format)"""

    def __init__(self, fileName, verbosity = 0):
        """Open `fileName` through the generic mapper parser."""
        super(BlastParser, self).__init__(fileName, verbosity)


    def __del__(self):
        """Delegate clean-up to the parent class."""
        super(BlastParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["blast"]


    def skipFirstLines(self):
        """Nothing to skip: this parser expects no header."""
        pass


    def parseLine(self, line):
        """Turn one 12-column tabular line into a Mapping.

        The process exits with a message when the line does not match the
        expected layout.
        """
        hit = re.search(r"^(\S+)\s+(\S+)\s+(\d+\.?\d*)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([-+]?\d+\.?\d*[eE]?[-+]?\d*)\s+(\d+\.?\d*)\s*$", line)
        if hit is None:
            sys.exit("\nLine %d '%s' does not have an Blast format" % (self.currentLineNb, line))

        # Columns 7-8 are the query coordinates, 9-10 the target coordinates.
        queryFrom = int(hit.group(7))
        queryTo = int(hit.group(8))
        targetFrom = int(hit.group(9))
        targetTo = int(hit.group(10))

        result = Mapping()

        onQuery = Interval()
        onQuery.setName(hit.group(1))
        onQuery.setStart(min(queryFrom, queryTo))
        onQuery.setEnd(max(queryFrom, queryTo))

        onTarget = Interval()
        onTarget.setChromosome(hit.group(2))
        onTarget.setStart(min(targetFrom, targetTo))
        onTarget.setEnd(max(targetFrom, targetTo))

        part = SubMapping()
        part.setQueryInterval(onQuery)
        part.setTargetInterval(onTarget)

        result.addSubMapping(part)

        result.setIdentity(round(float(hit.group(3))))
        result.setSize(int(hit.group(4)))
        result.setNbMismatches(int(hit.group(5)))
        result.setNbGaps(int(hit.group(6)))
        # The product is positive when query and target run the same way,
        # negative when they run in opposite ways.
        result.setDirection((queryTo - queryFrom) * (targetTo - targetFrom))
        result.setEvalue(float(hit.group(11)))

        return result
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/BlatFileParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BlatFileParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
import os


class BlatFileParser(object):
    """Read a BLAT tabular result file and collect its hits.

    Only lines whose first tab-separated field is an integer are treated as
    hits; every other line (header, separators, blank lines) is ignored.
    """

    def __init__(self, blatFileName = None):
        """Remember the file to parse and start with empty result containers."""
        self._blatFileName = blatFileName
        # Hits in file order (filled by parseBlatFile).
        self._lBlatHits = []
        # Query name -> list of hits (filled by parseBlatFileByQueries).
        self._dBlatHitsByQueries = {}
        # Query name -> 1, used as a set of the query names met so far.
        self._dQueries = {}

    def getDictOfQueries(self):
        """Return the dictionary whose keys are the query names met so far."""
        return self._dQueries

    def getResultLinesOfOneQuery(self, queryName):
        """Return the hits of `queryName` (KeyError if the query is unknown)."""
        return self._dBlatHitsByQueries[queryName]

    def getDictOfBlatHitsByQueries(self):
        """Return the dictionary mapping each query name to its hits."""
        return self._dBlatHitsByQueries

    def getListsOfHits(self):
        """Return the list of hits in file order."""
        return self._lBlatHits

    def _iterHits(self):
        """Yield one BlatParser per hit line and record its query name.

        The file is always closed, even when the iteration stops early.
        """
        # Imported here so that the class can be loaded on its own; the
        # parser is only needed once a file is actually read.
        from commons.core.parsing.BlatParser import BlatParser
        with open(self._blatFileName, 'r') as blatFile:
            # Line numbers start at 1 and count every line, hits or not.
            for n, line in enumerate(blatFile, 1):
                if self._isInteger(line.split("\t")[0]):
                    iBlatParser = BlatParser()
                    iBlatParser.setAttributesFromString(line, n)
                    self._dQueries[iBlatParser.getQName()] = 1
                    yield iBlatParser

    def parseBlatFile(self):
        """Parse the file and return the list of all hits, in file order."""
        self._lBlatHits.extend(self._iterHits())
        return self._lBlatHits

    def parseBlatFileByQueries(self):
        """Parse the file and return the hits grouped by query name."""
        for iBlatParser in self._iterHits():
            queryHeader = iBlatParser.getQName()
            self._dBlatHitsByQueries.setdefault(queryHeader, []).append(iBlatParser)
        return self._dBlatHitsByQueries

    def _isInteger(self, string):
        """Tell whether `string` can be converted to an int."""
        try:
            int(string)
            return True
        except ValueError:
            return False
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/BlatParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BlatParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,351 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+import sys\n+\n+## this class can parse a Blat results output file\n+#\n+class BlatParser(object):\n+\n+\n+ def __init__(self, match=\'\', mismatch=\'\', repMatch=\'\', N=\'\', QGapCount=\'\', QGapBases=\'\', TGapCount=\'\', TGapBases=\'\', strand=\'\', QName=\'\', QSize=\'\', QStart=\'\', QEnd=\'\', TName=\'\', TSize=\'\', TStart=\'\', TEnd=\'\', blockCount=\'\', blockSizes=\'\', qStarts=\'\', tStarts=\'\'):\n+ self._match = match\n+ self._mismatch = mismatch\n+ self._repMatch = repMatch\n+ self._N = N\n+ self._QGapCount = QGapCount\n+ self._QGapBases = QGapBases\n+ self._TGapCount = TGapCount\n+ self._TGapBases = TGapBases\n+ self._strand = strand\n+ self._QName = QName\n+ self._QSize = QSize\n+ self._QStart = QStart\n+ self._QEnd = QEnd\n+ self._TName = TName\n+ self._TSize = TSize\n+ self._TStart = TStart\n+ self._TEnd = TEnd\n+ self._blockCount = blockCount\n+ self._blockSizes = blockSizes\n+ self._qStarts = qStarts\n+ self._tStarts = tStarts\n+ \n+ def __eq__(self, o):\n+ return self._TName == o._TName and self._TSize == o._TSize and self._TStart == o._TStart and self._TEnd == o._TEnd\n+ \n+ def setMatch(self, match):\n+ self._match = match\n+ \n+ def setMismatch(self, mismatch):\n+ self._mismatch = mismatch\n+ \n+ def setRepMatch(self, repMatch):\n+ self._repMatch = repMatch\n+ \n+ def setN(self, N):\n+ self._N = N\n+ \n+ def setQGapCount(self, QGapCount):\n+ self._QGapCount = QGapCount\n+ \n+ def setQGapBases(self, QGapBases):\n+ self._QGapBases = QGapBases\n+ \n+ def setTGapCount(self, TGapCount):\n+ self._TGapCount = TGapCount\n+ \n+ def setTGapBases(self, TGapBases):\n+ self._TGapBases = TGapBases\n+ \n+ def setStrand(self, strand):\n+ self._strand = strand\n+ \n+ def setQName(self, QName):\n+ self._QName = QName\n+ \n+ def setQSize(self, QSize):\n+ self._QSize = QSize\n+ \n+ def setQStart(self, QStart):\n+ 
self._QStart = QStart\n+ \n+ def setQEnd(self, QEnd):\n+ self._QEnd = QEnd\n+ \n+ def setTName(self, TName):\n+ self._TName = TName\n+ \n+ def setTSize(self, TSize):\n+ self._TSize = TSize\n+ \n+ def setTStart(self'..b'e:\n+ sys.stderr.write("WARNING: The field QName is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[10] != \'\':\n+ self.setQSize(lResults[10])\n+ else:\n+ sys.stderr.write("WARNING: The field QSize is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[11] != \'\':\n+ self.setQStart(lResults[11])\n+ else:\n+ sys.stderr.write("WARNING: The field QStart is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[12] != \'\':\n+ self.setQEnd(lResults[12])\n+ else:\n+ sys.stderr.write("WARNING: The field QEnd is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[13] != \'\':\n+ self.setTName(lResults[13])\n+ else:\n+ sys.stderr.write("WARNING: The field TName is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[14] != \'\':\n+ self.setTSize(lResults[14])\n+ else:\n+ sys.stderr.write("WARNING: The field TSize is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[15] != \'\':\n+ self.setTStart(lResults[15])\n+ else:\n+ sys.stderr.write("WARNING: The field TStart is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[16] != \'\':\n+ self.setTEnd(lResults[16])\n+ else:\n+ sys.stderr.write("WARNING: The field TEnd is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[17] != \'\':\n+ self.setBlockCount(lResults[17])\n+ else:\n+ sys.stderr.write("WARNING: The field BlockCount is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[18] != \'\':\n+ self.setBlockSizes(lResults[18])\n+ else:\n+ 
sys.stderr.write("WARNING: The field BlockSizes is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[19] != \'\':\n+ self.setQStarts(lResults[19])\n+ else:\n+ sys.stderr.write("WARNING: The field QStarts is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if lResults[20] != \'\':\n+ self.setTStarts(lResults[20])\n+ else:\n+ sys.stderr.write("WARNING: The field TStarts is empty in blat file in line %s\\n" % iCurrentLineNumber)\n+ error = True\n+ \n+ if error == True:\n+ self._setAllToNull()\n+ \n+ def setAttributesFromString(self, blatLine, iCurrentLineNumber ="", fieldSeparator ="\\t"):\n+ blatLine = blatLine.rstrip()\n+ lBlatLineItem = blatLine.split(fieldSeparator)\n+ if not len(lBlatLineItem) == 21:\n+ sys.stderr.write("WARNING: The line %s is not valid blat line (%s columns -> 21 columns needed)\\n" % (iCurrentLineNumber, len(lBlatLineItem)))\n+ else:\n+ self.setAttributes(lBlatLineItem, iCurrentLineNumber)\n+ \n+ def _setAllToNull(self):\n+ self._match = \'\'\n+ self._mismatch = \'\'\n+ self._repMatch = \'\'\n+ self._N = \'\'\n+ self._QGapCount = \'\'\n+ self._QGapBases = \'\'\n+ self._TGapCount = \'\'\n+ self._TGapBases = \'\'\n+ self._strand = \'\'\n+ self._QName = \'\'\n+ self._QSize = \'\'\n+ self._QStart = \'\'\n+ self._QEnd = \'\'\n+ self._TName = \'\'\n+ self._TSize = \'\'\n+ self._TStart = \'\'\n+ self._TEnd = \'\'\n+ self._blockCount = \'\'\n+ self._blockSizes = \'\'\n+ self._qStarts = \'\'\n+ self._tStarts = \'\'\n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/BlatToGff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BlatToGff.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,116 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 

import optparse
import os


class BlatToGff(object):
    """Convert a BLAT tabular result file into a GFF3 file (one line per hit)."""

    # Number of header lines at the top of the BLAT result file.
    _NB_HEADER_LINES = 5

    def __init__(self):
        pass

    def setAttributesFromCmdLine(self):
        """Read the command line options and store them in `self._options`."""
        help = '\
        \nThis Script Launch BlatToGff.\n\n\
        Example 1: python BlatToGff.py -i blatResultsFile.tab -o outputFile.gff3\n\n'
        parser = optparse.OptionParser(usage= help, version="BlatToGff.py v1.0")
        parser.add_option( '-i', '--input', dest='inputBLAT', help='Blat Input File Name [Format: tabular]', default= None )
        parser.add_option( '-o', '--output', dest='output', help='Output File Name [Format: GFF3]', default= None )
        parser.add_option( '-n', '--methodname', dest='methodName', help='Method name in col. 3 [Default: None]', default= None )
        ( options, args ) = parser.parse_args()
        self._options = options

    def checkOptions(self):
        """Validate the options and copy them into the working attributes.

        Raises an Exception when the input file is missing or does not
        exist, or when no output file is given.
        """
        # The options default to None, so test for "not given" rather than
        # for the empty string only.
        if not self._options.inputBLAT:
            raise Exception("ERROR: No Blat file specified for -i !")
        elif not os.path.exists(self._options.inputBLAT):
            raise Exception("ERROR: Blat Input File doesn't exist !")
        else:
            self._inputFileBlat = self._options.inputBLAT

        if not self._options.output:
            raise Exception("ERROR: No Output file specified for -o !")
        else:
            self._outputFileGFF = self._options.output

        self._methodName = self._options.methodName

    def run(self):
        """Check the options, then write one GFF3 line per BLAT hit."""
        self.checkOptions()
        self._createGFFOutputFile()
        # Both files are opened once and closed whatever happens.
        with open(self._inputFileBlat, 'r') as BLATFile:
            with open(self._outputFileGFF, 'a') as GFFfile:
                for numberLine, blatLine in enumerate(BLATFile, 1):
                    if numberLine <= self._NB_HEADER_LINES:
                        continue
                    GFFfile.write(self.convertBlatObjectToGffLine(blatLine, numberLine))

    def convertBlatObjectToGffLine(self, blatLine, numberLine):
        """Return the GFF3 line describing the hit found in `blatLine`.

        `numberLine` is the line number in the BLAT file; it is handed to
        the BLAT parser together with the line.
        """
        # Imported here so that the class can be loaded on its own; the
        # parser is only needed once a line is actually converted.
        from commons.core.parsing.BlatParser import BlatParser
        iBlatHit = BlatParser()
        iBlatHit.setAttributesFromString(blatLine, numberLine)
        col1 = iBlatHit.getTName()
        col2 = 'BlatToGff'
        if self._methodName == '' or self._methodName == None:
            col3 = 'BES'
        else:
            col3 = '%s:BES' % self._methodName
        col4 = iBlatHit.getTStart()
        col5 = iBlatHit.getTEnd()
        col6 = '.'
        # NOTE(review): the strand is always written as '+', whatever the
        # strand of the hit -- confirm this is intended.
        col7 = '+'
        col8 = '.'
        col9 = 'ID=%s;Name=%s;bes_start=%s;bes_end=%s;bes_size=%s' % (iBlatHit.getQName(), iBlatHit.getQName(), iBlatHit.getTStart(), iBlatHit.getTEnd(), iBlatHit.getTSize())
        gffLine = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (col1, col2, col3, col4, col5, col6, col7, col8, col9)
        return gffLine

    def _createGFFOutputFile(self):
        """Create (or truncate) the output file and write the GFF3 header."""
        with open(self._outputFileGFF, 'w') as GFFfile:
            GFFfile.write("##gff-version 3\n")

    def _printGFFLinesToOutputFile(self, line):
        """Append `line` to the output file."""
        with open(self._outputFileGFF, 'a') as GFFfile:
            GFFfile.write(line)

if __name__ == '__main__':
    iBlatToGff = BlatToGff()
    iBlatToGff.setAttributesFromCmdLine()
    iBlatToGff.run()
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/BlatToGffForBesPaired.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BlatToGffForBesPaired.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,266 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+import optparse\n+import os\n+import sys\n+import re\n+import datetime\n+from commons.core.parsing.BlatParser import BlatParser\n+from commons.core.seq.FastaUtils import FastaUtils \n+\n+class BlatToGffForBesPaired(object):\n+\n+\n+ def __init__(self):\n+ pass\n+ \n+ def setAttributesFromCmdLine(self):\n+ help = \'\\\n+ \\nThis Script Launch BlatToGffForBesPaired.\\n\\n\\\n+ Example 1: python BlatToGffForBesPaired.py -i blatResultsFile.tab -f besSequences.fasta -o outputFile.gff3\\n\\\n+ Example 2: python BlatToGffForBesPaired.py -i blatResultsFile.tab -f besSequences.fasta -o outputFile.gff3 -n muscadine:filtre1\\n\\n\\\n+ Note 1: In blat input file, all BAC-Ends must be paired. In addition, they must be one above the other.\\nFor example, if you have the BES MRRE1H032F08FM1 (forward), we must have the BES MRRE1H032F08RM1 (reverse) just after, like:\\n\\\n+ 554\\t26\\t0\\t0\\t1\\t16\\t1\\t17\\t+\\tMRRE1H032F08FM1\\t606\\t10\\t606\\tchr11\\t19818926\\t3725876\\t3726473\\t2\\t553,27,\\t10,579,\\t3725876,3726446,\\n\\\n+ 620\\t23\\t0\\t0\\t0\\t0\\t0\\t0\\t-\\tMRRE1H032F08RM1\\t643\\t0\\t643\\tchr11\\t19818926\\t3794984\\t3795627\\t1\\t643,\\t0,\\t3794984,\\n\\\n+ Note 2: the header in Blat results output file must be present (5 lines).\\n\\n\'\n+ \n+ parser = optparse.OptionParser(usage= help, version="CovertSamToFastq.py v1.0")\n+ parser.add_option( \'-i\', \'--input\', dest=\'inputBLAT\', help=\'Blat Input File Name, with BES paired (1 Forward and 1 Reverse) [Format: tabular]\', default= None )\n+ parser.add_option( \'-f\', \'--fasta\', dest=\'inputFASTA\', help=\'Fasta Input File Name, with all sequences of BES [Format: fasta]\', default= None )\n+ parser.add_option( \'-o\', \'--output\', dest=\'output\', help=\'Output File Name [Format: GFF3]\', default= None )\n+ parser.add_option( \'-n\', \'--methodname\', 
dest=\'methodName\', help=\'Method name in col. 3 [Default: None]\', default= None )\n+ ( options, args ) = parser.parse_args()\n+ self._options = options\n+ \n+ def checkOptions(self):\n+ if self._options.inputBLAT == \'\':\n+ raise Exception("ERROR: No Blat file specified for -i !")\n+ elif not os.path.exists(self._options.inputBLAT):\n+ raise Exception("ERROR: Blat Input File doesn\'t exist !")\n+ else:\n+ self._inputFileBlat = self._options.inputBLAT\n+ '..b' col9 = \'ID=%s;Name=%s;bac_start=%s;bac_end=%s;bac_size=%s;besFM_name=%s;muscadine_besFM_seq=%s;besRM_name=%s;muscadine_besRM_seq=%s\' % (bacName, bacName, startBacPos, endBacPos, sizeBacPos, nameBesFM, seqBesFM, nameBesRM, seqBesRM)\n+ gffLine = \'%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n\' % (col1, col2, col3, col4, col5, col6, col7, col8, col9)\n+ return gffLine\n+ return None\n+ \n+ def getBesFmAndRmNamesAndSequences(self, besName1, seqBes1, typeBes1, besName2, seqBes2, typeBes2):\n+ if typeBes1 == \'FM\' and typeBes2 == \'RM\':\n+ return besName1, seqBes1, besName2, seqBes2\n+ elif typeBes1== \'RM\' and typeBes2 == \'FM\':\n+ return besName2, seqBes2, besName1, seqBes1\n+\n+ def getBesName(self, col9):\n+ lCol9 = col9.split(\';\')\n+ ID = lCol9[0]\n+ besName = ID[3:]\n+ return besName\n+ \n+ def getBacName(self, besName):\n+ bacName = besName[:-3]\n+ return bacName\n+\n+ def checkBesNames(self, besName1, besName2, line):\n+ bacName1 = besName1[:-3]\n+ bacName2 = besName2[:-3]\n+ if bacName1 == bacName2:\n+ return True\n+ else:\n+ sys.stderr.write("WARNING: Lines %s and %s the two Bes (%s AND %s) do not belong to the same BAC !!!\\n -> you have to filter this Blat file...\\n" % (int(line)-1, line, besName1, besName2))\n+ return False\n+ \n+ def checkBesPositions(self, tBes1, tBes2):\n+ if tBes1[0] == tBes2[0]:\n+ minBes1 = min(tBes1[1], tBes1[2])\n+ maxBes1 = max(tBes1[1], tBes1[2])\n+ minBes2 = min(tBes2[1], tBes2[2])\n+ maxBes2 = max(tBes2[1], tBes2[2])\n+ if (minBes1 < minBes2 and maxBes1 < 
minBes2) or (minBes2 < minBes1 and maxBes2 < minBes1):\n+ return True\n+ return False\n+ \n+ def getBacPositions(self, tBes1, tBes2):\n+ startBacPos = 0\n+ endBacPos = 0\n+ minBes1 = min(tBes1[1], tBes1[2])\n+ maxBes1 = max(tBes1[1], tBes1[2])\n+ minBes2 = min(tBes2[1], tBes2[2])\n+ maxBes2 = max(tBes2[1], tBes2[2])\n+ if minBes1 < minBes2:\n+ startBacPos = minBes1\n+ endBacPos = maxBes2\n+ else:\n+ startBacPos = minBes2\n+ endBacPos = maxBes1\n+ return startBacPos, endBacPos\n+ \n+ def extractBesSequenceFromFastaFile(self, besName, numberLine):\n+ seq = \'\'\n+ date = datetime.datetime.now()\n+ date = date.strftime("%d%m%Y_%H%M%S")\n+ tmpFileName = \'tmp_BlatToGffForBesPaired_%s.fasta\' % date\n+ iFastaUtils = FastaUtils()\n+ iFastaUtils.dbExtractByPattern(besName, self._inputFileFasta, tmpFileName)\n+ \n+ if os.path.exists(tmpFileName):\n+ newFastaFile = open(tmpFileName, \'r\')\n+ line = newFastaFile.readline()\n+ if line != \'\':\n+ while line != \'\':\n+ if line[0] != \'>\':\n+ line = line.replace(\'\\n\', \'\')\n+ seq += line\n+ line = newFastaFile.readline()\n+ newFastaFile.close()\n+ os.remove(tmpFileName)\n+ return seq\n+ os.remove(tmpFileName)\n+ \n+ sys.stderr.write("WARNING: At line %s, the BAC-Ends (%s) hasn\'t got sequence in fasta file (%s) !!\\n" % (numberLine, besName, os.path.basename(self._inputFileFasta)))\n+ return \'NA\'\n+ \n+ def _createGFFOutputFile(self):\n+ GFFfile = open(self._outputFileGFF, \'w\')\n+ GFFfile.write("##gff-version 3\\n")\n+ GFFfile.close()\n+ \n+ def _printGFFLinesToOutputFile(self, lLines):\n+ GFFfile = open(self._outputFileGFF, \'a\')\n+ for line in lLines:\n+ GFFfile.write(line)\n+ GFFfile.close()\n+\n+if __name__ == \'__main__\':\n+ iBlatToGffForBesPaired = BlatToGffForBesPaired()\n+ iBlatToGffForBesPaired.setAttributesFromCmdLine()\n+ iBlatToGffForBesPaired.run()\n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/BowtieParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/BowtieParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,91 @@ +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.SubMapping import SubMapping
from SMART.Java.Python.structure.Interval import Interval

class BowtieParser(MapperParser):
    """A class that parses BowTie format"""

    def __init__(self, fileName, verbosity = 0):
        """Open `fileName` through the generic mapper parser."""
        super(BowtieParser, self).__init__(fileName, verbosity)


    def __del__(self):
        """Delegate clean-up to the parent class."""
        super(BowtieParser, self).__del__()


    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["bowtie"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        """Nothing to skip: this parser expects no header."""
        pass


    def parseLine(self, line):
        """Turn one tab-separated BowTie line into a Mapping.

        The line must have 7 fields, or 8 when a comma-separated list of
        mismatches is present; an Exception is raised otherwise.
        """
        line = line.strip()
        fields = line.split("\t")
        if len(fields) not in (7, 8):
            raise Exception("Line %d '%s' does not look like a BowTie line (number of fields is %d instead of 7 or 8)" % (self.currentLineNb, line, len(fields)))
        name = fields[0]
        direction = 1 if fields[1] == "+" else -1
        chromosome = fields[2]
        # The offset read from the file is shifted by one.
        genomeStart = int(fields[3]) + 1
        sequence = fields[4]
        size = len(sequence)
        # Field 7 is not used, but it is still converted so that a
        # non-numeric value is rejected exactly as before.
        int(fields[6])
        nbMismatches = 0
        if len(fields) == 8:
            # One comma-separated entry per mismatch.
            nbMismatches = len(fields[7].split(","))

        mapping = Mapping()
        queryInterval = Interval()
        queryInterval.setName(name)
        # The read spans positions 1..size, the same length as the target
        # interval below (the end used to be size + 1, one base too long).
        queryInterval.setStart(1)
        queryInterval.setEnd(size)
        targetInterval = Interval()
        targetInterval.setChromosome(chromosome)
        targetInterval.setStart(genomeStart)
        targetInterval.setEnd(genomeStart + size - 1)
        subMapping = SubMapping()
        subMapping.setQueryInterval(queryInterval)
        subMapping.setTargetInterval(targetInterval)
        mapping.addSubMapping(subMapping)
        mapping.setSize(size)
        mapping.setNbMismatches(nbMismatches)
        mapping.setDirection(direction)
        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/CoordsParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/CoordsParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,137 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.SubMapping import SubMapping
from SMART.Java.Python.misc import Utils

# Header lines that identify the column layout of the file.
_TAB_HEADER = "[S1]\t[E1]\t[S2]\t[E2]\t[LEN 1]\t[LEN 2]\t[% IDY]\t[LEN R]\t[LEN Q]\t[COV R]\t[COV Q]\t[FRM]\t[TAGS]"
_PIPE_SIM_HEADER = "[S1] [E1] | [S2] [E2] | [LEN 1] [LEN 2] | [% IDY] [% SIM] [% STP] | [FRM] [TAGS]"


class CoordsParser(MapperParser):
    """A class that parses the .coords output of Nucmer"""

    def __init__(self, fileName, verbosity = 0):
        # layout 1: pipe-delimited columns, identity only
        self._lineParseRe = re.compile(r"^\s*(?P<tStart>\d+)\s+(?P<tEnd>\d+)\s+\|\s+(?P<qStart>\d+)\s+(?P<qEnd>\d+)\s+\|\s+(?P<tLength>\d+)\s+(?P<qLength>\d+)\s+\|\s+(?P<identity>\d+\.?\d*)\s+\|\s+(?P<tName>[\w\|\:\-]+)\s+(?P<qName>.*)\s*$")
        # layout 2: no pipes, with lengths, coverages and frames
        self._lineParseRe2 = re.compile(r"^\s*(?P<tStart>\d+)\s+(?P<tEnd>\d+)\s+(?P<qStart>\d+)\s+(?P<qEnd>\d+)\s+(?P<tLength>\d+)\s+(?P<qLength>\d+)\s+(?P<identity>\d+\.?\d*)\s+(?P<rlen>\d+\.?\d*)\s+(?P<qlen>\d+\.?\d*)\s+(?P<rcov>\d+\.?\d*)\s+(?P<qcov>\d+\.?\d*)\s+(?P<rframe>[-]?\d+\.?\d*)\s+(?P<qframe>[-]?\d+\.?\d*)\s+(?P<tName>[\w\|\:\-]+)\s+(?P<qName>.*)\s*$")
        # layout 3: pipe-delimited columns, with similarity, stops and frames
        self._lineParseRe3 = re.compile(r"^\s*(?P<tStart>\d+)\s+(?P<tEnd>\d+)\s+\|\s+(?P<qStart>\d+)\s+(?P<qEnd>\d+)\s+\|\s+(?P<tLength>\d+)\s+(?P<qLength>\d+)\s+\|\s+(?P<identity>\d+\.?\d*)\s+(?P<sim>\d+\.?\d*)\s+(?P<stp>\d+\.?\d*)\s+\|\s+(?P<rframe>[-]?\d+\.?\d*)\s+(?P<qframe>[-]?\d+\.?\d*)\s+(?P<tName>[\w\|\:\-]+)\s+(?P<qName>.*)\s*$")
        # layout 4: no pipes, with similarity, stops, lengths, coverages and frames
        self._lineParseRe4 = re.compile(r"^\s*(?P<tStart>\d+)\s+(?P<tEnd>\d+)\s+(?P<qStart>\d+)\s+(?P<qEnd>\d+)\s+(?P<tLength>\d+)\s+(?P<qLength>\d+)\s+(?P<identity>\d+\.?\d*)\s+(?P<sim>\d+\.?\d*)\s+(?P<stp>\d+\.?\d*)\s+(?P<rlen>\d+\.?\d*)\s+(?P<qlen>\d+\.?\d*)\s+(?P<rcov>\d+\.?\d*)\s+(?P<qcov>\d+\.?\d*)\s+(?P<rframe>[-]?\d+\.?\d*)\s+(?P<qframe>[-]?\d+\.?\d*)\s+(?P<tName>[\w\|\:\-]+)\s+(?P<qName>.*)\s*$")
        self.lineType = 1
        MapperParser.__init__(self, fileName, verbosity)

    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["coords"]

    def skipFirstLines(self):
        """
        Read the header of the file and work out which column layout follows.

        Reading stops at the end of the file, at the "=====" separator, or
        at a tab-delimited column header.  self.lineType is left at 1 unless
        a header identifying layout 2, 3 or 4 is met.
        """
        while True:
            line = self.handle.readline()
            self.currentLineNb += 1
            if line == "":
                break
            if "=====" in line:
                break
            if _TAB_HEADER in line:
                self.lineType = 2
                break
            # Columns of the pipe-delimited header may be padded with any
            # amount of blanks: compare on whitespace-normalised text.  No
            # break here, the "=====" separator still has to be consumed.
            if _PIPE_SIM_HEADER in " ".join(line.split()):
                self.lineType = 3

            if "[% IDY]\t[% SIM]\t[% STP]" in line and "[LEN Q]" in line:
                self.lineType = 4
                break

    def parseLine(self, line):
        """
        Convert one alignment line into a Mapping holding one SubMapping.

        @param line: the raw line read from the file
        @return:     the Mapping built from the line
        Exits the program (sys.exit) if the line does not match the layout
        detected by skipFirstLines.
        """
        m = None
        if self.lineType == 1:
            m = self._lineParseRe.search(line)
        elif self.lineType == 2:
            m = self._lineParseRe2.search(line)
        elif self.lineType == 3:
            m = self._lineParseRe3.search(line)
        elif self.lineType == 4:
            m = self._lineParseRe4.search(line)
        if m is None:
            sys.exit("\nLine %d '%s' does not have a NucMer format" % (self.currentLineNb, line))

        qName = m.group("qName")
        tName = m.group("tName")
        qStart = int(m.group("qStart"))
        qEnd = int(m.group("qEnd"))
        tStart = int(m.group("tStart"))
        tEnd = int(m.group("tEnd"))
        qLength = int(m.group("qLength"))
        tLength = int(m.group("tLength"))
        identity = float(m.group("identity"))
        size = min(qLength, tLength)
        # orientation is carried by the order of the two coordinates
        qDirection = qEnd - qStart
        tDirection = tEnd - tStart

        mapping = Mapping()

        subMapping = SubMapping()
        subMapping.queryInterval.setName(qName)
        subMapping.queryInterval.setStart(min(qStart, qEnd))
        subMapping.queryInterval.setEnd(max(qStart, qEnd))
        subMapping.queryInterval.setSize(qLength)
        subMapping.queryInterval.setDirection(qDirection)

        subMapping.targetInterval.setChromosome(tName)
        subMapping.targetInterval.setStart(min(tStart, tEnd))
        subMapping.targetInterval.setEnd(max(tStart, tEnd))
        subMapping.targetInterval.setSize(tLength)
        subMapping.targetInterval.setDirection(tDirection)

        subMapping.setDirection(qDirection)
        subMapping.setSize(size)
        subMapping.setIdentity(identity)

        mapping.addSubMapping(subMapping)
        mapping.targetInterval.setStart(min(tStart, tEnd))
        mapping.targetInterval.setEnd(max(tStart, tEnd))
        mapping.targetInterval.setSize(tLength)
        mapping.targetInterval.setChromosome(tName)

        mapping.queryInterval.setStart(min(qStart, qEnd))
        mapping.queryInterval.setEnd(max(qStart, qEnd))
        mapping.queryInterval.setSize(qLength)
        mapping.queryInterval.setName(qName)
        mapping.setDirection(qDirection)
        mapping.setSize(size)
        mapping.setIdentity(identity)
        mapping.setTagValue("feature", "match")
        mapping.setTagValue("Target", "%s %d %d" % (qName, qStart, qEnd))

        # only the layouts without pipes carry the length and coverage columns
        if self.lineType == 2 or self.lineType == 4:
            mapping.setTagValue("target_pident", identity)
            mapping.setTagValue("target_pcover", float(m.group("qcov")))
            mapping.setTagValue("target_length", int(m.group("qlen")))


# Specific to Mark Work. Commented lines because of possible slowdown.
#        for line in self.handle:
#            string1 = line.strip()
#            self.currentLineNb += 1
#            break
#        for line in self.handle:
#            string2 = line.strip()
#            self.currentLineNb += 1
#            break
#        print(len(string1),len(string2))
#        mapping.setNbMismatches(Utils.getHammingDistance(string1, string2))
        mapping.setNbGaps(0)

        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/CrossSsrAndBesMappedByBlatToGff.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/CrossSsrAndBesMappedByBlatToGff.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,197 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import os\n+import optparse\n+from commons.core.parsing.SsrParser import SsrParser\n+from commons.core.parsing.BlatParser import BlatParser\n+\n+class CrossSsrAndBesMappedByBlatToGff(object):\n+\n+\n+ def __init__(self):\n+ self._inputFileSSR = \'\'\n+ self._inputFileBlat = \'\'\n+ self._outputFileGFF = \'\'\n+ \n+ def setAttributesFromCmdLine(self):\n+ help = \'\\\n+ \\nThis Script Launch CrossSsrAndBesMappedByBlatToGff.\\n\\n\\\n+ Example 1: python CrossSsrAndBesMappedByBlatToGff.py -s ssrResultsFile.tab -b blatResultsFile.tab -o outputFile.gff3\\n\\\n+ Example 2: python CrossSsrAndBesMappedByBlatToGff.py -s ssrResultsFile.tab -b blatResultsFile.tab -o outputFile.gff3 -n muscadine:filtre1\\n\\n\'\n+ \n+ parser = optparse.OptionParser(usage= help, version="CovertSamToFastq.py v1.0")\n+ parser.add_option( \'-s\', \'--ssr\', dest=\'inputSSR\', help=\'SSR Input File Name [Format: tabular]\', default= None )\n+ parser.add_option( \'-b\', \'--blat\', dest=\'inputBLAT\', help=\'Blat Input File Name [Format: tabular]\', default= None )\n+ parser.add_option( \'-o\', \'--output\', dest=\'output\', help=\'Output File Name [Format: GFF3]\', default= None )\n+ parser.add_option( \'-n\', \'--methodName\', dest=\'methodName\', help=\'Method name in col. 
3 [Default: None]\', default= None )\n+ ( options, args ) = parser.parse_args()\n+ self.options = options\n+ \n+ def checkOptions(self):\n+ if self.options.inputSSR == \'\':\n+ raise Exception("ERROR: No SSR file specified for -s !")\n+ elif not os.path.exists(self.options.inputSSR):\n+ raise Exception("ERROR: SSR Input File doesn\'t exist !")\n+ else:\n+ self._inputFileSSR = self.options.inputSSR\n+ \n+ if self.options.inputBLAT == \'\':\n+ raise Exception("ERROR: No Blat file specified for -b !")\n+ elif not os.path.exists(self.options.inputBLAT):\n+ raise Exception("ERROR: Blat Input File doesn\'t exist !")\n+ else:\n+ self._inputFileBlat = self.options.inputBLAT\n+ \n+ if self.options.output == \'\':\n+ raise Exception("ERROR: No Output file specified for -o !")\n+ else:\n+ self._outputFileGFF = self.options.output\n+ \n+ self._methodName = self.options.methodName\n+ \n+ def run(self):\n+ '..b'\n+ besNameToKeep = BlatHitObject.getQName()\n+ lOfSSRHitObject = dictSsrParser[besNameToKeep]\n+ \n+ for SSRHitObject in lOfSSRHitObject:\n+ posSSRStart = self.convertSSRPositionsToChromPositions(SSRHitObject.getSsrStart(), BlatHitObject.getTStart(), BlatHitObject.getTEnd(), BlatHitObject.getStrand())\n+ posSSREnd = self.convertSSRPositionsToChromPositions(SSRHitObject.getSsrEnd(), BlatHitObject.getTStart(), BlatHitObject.getTEnd(), BlatHitObject.getStrand())\n+ ssrSeq = self.getSsrSeq(SSRHitObject.getSsrMotif(), SSRHitObject.getSsrMotifNumber())\n+ \n+ col1 = BlatHitObject.getTName()\n+ col2 = \'CrossSsrAndBesAlignedByBlat\'\n+ if self._methodName != \'\' and self._methodName != None:\n+ col3 = \'%s:SSR\' %self._methodName\n+ else:\n+ col3 = \'SSR\'\n+ col4 = posSSRStart\n+ col5 = posSSREnd\n+ col6 = \'.\'\n+ col7 = BlatHitObject.getStrand()\n+ col8 = \'.\'\n+ col9 = \'ID=SSR_%s_%s;Name=SSR_%s_%s;bes_name=%s;bes_size=%s;bes_matchstart=%s;bes_matchend=%s;bes_redundancy=%s;ssr_type=%s;ssr_motif=%s;ssr_motif_number=%s;ssr_start=%s;ssr_end=%s;muscadine_seq=%s\' % 
(besNameToKeep, SSRHitObject.getBesRedundancy(), \n+ besNameToKeep, SSRHitObject.getBesRedundancy(),\n+ besNameToKeep, BlatHitObject.getQSize(),\n+ BlatHitObject.getQStart(), BlatHitObject.getQEnd(), \n+ SSRHitObject.getBesRedundancy(), SSRHitObject.getSsrNbNucleotides(),\n+ SSRHitObject.getSsrMotif(), SSRHitObject.getSsrMotifNumber(),\n+ SSRHitObject.getSsrStart(), SSRHitObject.getSsrEnd(), ssrSeq)\n+ gffLine = \'%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n\' % (col1, col2, col3, col4, col5, col6, col7, col8, col9)\n+ listGffLines.append(gffLine)\n+ \n+ return listGffLines\n+ \n+ def convertSSRPositionsToChromPositions(self, ssrPos, chromPosStart, chromPosEnd, strand):\n+ if strand == \'+\':\n+ newPos = int(chromPosStart) + int(ssrPos) - 1\n+ elif strand == \'-\':\n+ newPos = int(chromPosEnd) - int(ssrPos) + 1\n+ return newPos\n+ \n+ def getSsrSeq(self, motif, nbMotif):\n+ ssrSeq = motif * int(nbMotif)\n+ return ssrSeq\n+ \n+ def _createGFFOutputFile(self):\n+ GFFfile = open(self._outputFileGFF, \'w\')\n+ GFFfile.write("##gff-version 3\\n")\n+ GFFfile.close()\n+ \n+ def _printGFFLinesToOutputFile(self, lLinesToPrint):\n+ GFFfile = open(self._outputFileGFF, \'a\')\n+ for line in lLinesToPrint:\n+ GFFfile.write(line)\n+ GFFfile.close()\n+\n+if __name__ == \'__main__\':\n+ iCrossSsrAndBesMappedByBlatToGff = CrossSsrAndBesMappedByBlatToGff()\n+ iCrossSsrAndBesMappedByBlatToGff.setAttributesFromCmdLine()\n+ iCrossSsrAndBesMappedByBlatToGff.run()\n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/ElandParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/ElandParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,126 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Mapping import Mapping


class ElandParser(MapperParser):
    """A class that parses ELAND format"""

    def __init__(self, fileName, verbosity = 0):
        super(ElandParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(ElandParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["eland"]


    def skipFirstLines(self):
        """Nothing is skipped: parsing starts at the first line of the file."""
        pass


    def getInfos(self):
        super(ElandParser, self).getInfos()


    def parseLine(self, line):
        """
        Convert one tab-separated ELAND line into a Mapping.

        At least 22 fields are expected.  The ones used here are: flow cell
        (0), run (1), lane (2), tile (3), x (4) and y (5) coordinates, read
        number (7), read sequence (8), quality string (9), chromosome (10),
        position (12), strand (13), match description (14), single-read
        score (15) and filtering flag (21).

        @param line: the raw line read from the file
        @return:     the Mapping, or None if the read has no position or
                     if its filtering flag is not "Y"
        Exits the program (sys.exit) on a malformed line or when the read
        number is not "1".
        """
        line = line.strip()

        fields = line.split("\t")
        if len(fields) < 22:
            sys.exit("Line %d '%s' does not look like a ELAND line (number of fields is %d instead of 22)" % (self.currentLineNb, line, len(fields)))

        flowCell, run, lane, tile, xcoord, ycoord = fields[0:6]
        number = fields[7]
        read = fields[8]
        quality = fields[9]
        chromosome = fields[10]
        position = fields[12]
        strand = fields[13]
        description = fields[14]
        singleScore = fields[15]
        filtering = fields[21]

        if number != "1":
            sys.exit("S-MART cannot handle pair-end reads yet!")

        # nothing found
        if position == "":
            return None

        # mapping filtered out: no need to build anything
        if filtering != "Y":
            return None

        name = "%s_%s:%s:%s:%s:%s#0/1" % (flowCell, run, lane, tile, xcoord, ycoord)
        direction = 1 if strand == "F" else -1
        # every upper-case letter of the description counts as one mismatch
        nbMismatches = sum(1 for char in description if "A" <= char <= "Z")
        readSize = len(read)
        targetStart = int(position)

        mapping = Mapping()
        mapping.setTagValue("qualityString", quality)

        mapping.queryInterval.setName(name)
        mapping.queryInterval.setDirection(direction)
        mapping.queryInterval.setStart(1)
        mapping.queryInterval.setEnd(readSize)

        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(targetStart)
        # a read of readSize bases starting at targetStart ends at
        # targetStart + readSize - 1 (same length as the query interval)
        mapping.targetInterval.setEnd(targetStart + readSize - 1)
        mapping.targetInterval.setDirection(1)

        mapping.setSize(readSize)
        mapping.setDirection(direction)

        mapping.setNbGaps(0)
        mapping.setNbMismatches(nbMismatches)
        mapping.setTagValue("score", int(singleScore))

        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/ExoParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/ExoParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,137 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.SubMapping import SubMapping

# One alignment line: query name/start/end/strand, target name/start/end/strand,
# an ignored number, the number of mismatches, then the list of steps.
_RYO_LINE_RE = re.compile(r"^\s*(\S+)\s+(\d+)\s+(\d+)\s+[+-]\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+\d+\s+(\d+)\s+(\S.*)$")
# One step of the alignment: a one-character label and two lengths.
_RYO_STEP_RE = re.compile(r"^(\w)\s+(\d+)\s+(\d+)")


class ExoParser(MapperParser):
    """A class that parses the output of Exonerate - roll your own format"""

    def __init__(self, fileName, verbosity = 0):
        super(ExoParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(ExoParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["exo", "exonerate"]


    def skipFirstLines(self):
        """
        Skip everything up to and including the line containing "Hostname".

        Stops quietly at the end of the file if no such line exists
        (readline() keeps returning "" there, which would otherwise never
        end the loop).
        """
        while True:
            line = self.handle.readline()
            if line == "" or "Hostname" in line:
                break
            self.currentLineNb += 1


    def _closeSubMapping(self, mapping, subMapping, queryEnd, targetEnd):
        """Set the ends of a finished sub-mapping and attach it to the mapping."""
        subMapping.queryInterval.setEnd(queryEnd)
        subMapping.targetInterval.setEnd(targetEnd)
        mapping.addSubMapping(subMapping)


    def parseLine(self, line):
        """
        Convert one alignment line into a Mapping, one SubMapping per block.

        Steps labelled "M" open (or continue) a block, "G" steps are counted
        as gaps, and "I", "5" or "3" steps close the current block.

        @param line: the raw line read from the file
        @return:     the Mapping, or None for the end-of-run marker line
        Exits the program (sys.exit) on a malformed line or an unknown step
        label.
        """
        # tolerate a missing or different line terminator on the marker line
        if line.strip() == "-- completed exonerate analysis":
            return None

        m = _RYO_LINE_RE.search(line)
        if m is None:
            sys.exit("\nLine %d '%s' does not have a RYO format" % (self.currentLineNb, line))

        mapping = Mapping()
        name = m.group(1)
        queryStart = min(int(m.group(2)), int(m.group(3)))
        queryEnd = max(int(m.group(2)), int(m.group(3)))-1
        chromosome = m.group(4)
        targetStart = min(int(m.group(5)), int(m.group(6)))
        targetEnd = max(int(m.group(5)), int(m.group(6)))-1
        direction = m.group(7)
        nbMismatches = int(m.group(8))
        rest = m.group(9).strip()

        nbGaps = 0
        queryOffset = 0
        targetOffset = 0

        subMapping = None
        step = _RYO_STEP_RE.search(rest)
        while step is not None:
            label = step.group(1)
            queryDistance = int(step.group(2))
            targetDistance = int(step.group(3))

            if label == "M":
                # Initialise the block only when it is opened, so that a
                # match following a gap extends the current block instead
                # of moving its start.
                if subMapping is None:
                    subMapping = SubMapping()

                    subMapping.setSize(queryDistance)
                    subMapping.setDirection(direction)

                    subMapping.queryInterval.setName(name)
                    subMapping.queryInterval.setStart(queryStart + queryOffset)
                    subMapping.queryInterval.setDirection(direction)

                    subMapping.targetInterval.setChromosome(chromosome)
                    subMapping.targetInterval.setStart(targetStart + targetOffset)
                    subMapping.targetInterval.setDirection(1)

            elif label == "G":
                nbGaps += max(queryDistance, targetDistance)

            elif label in ("I", "5", "3"):
                if subMapping is not None:
                    self._closeSubMapping(mapping, subMapping, queryStart + queryOffset - 1, targetStart + targetOffset - 1)
                    subMapping = None
            else:
                sys.exit("Cannot understand sign '%s' in line %s" % (label, line))

            queryOffset += queryDistance
            targetOffset += targetDistance
            rest = rest[step.end():].strip()
            step = _RYO_STEP_RE.search(rest)

        # last block, still open at the end of the line
        if subMapping is not None:
            self._closeSubMapping(mapping, subMapping, queryStart + queryOffset - 1, targetStart + targetOffset - 1)

        mapping.setNbMismatches(nbMismatches)
        mapping.setNbGaps(nbGaps)
        mapping.setDirection(direction)

        mapping.queryInterval.setName(name)
        mapping.queryInterval.setStart(queryStart)
        mapping.queryInterval.setEnd(queryEnd)

        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(targetStart)
        mapping.targetInterval.setEnd(targetEnd)

        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/FastaParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/FastaParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,173 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from commons.core.parsing.SequenceListParser import SequenceListParser
from SMART.Java.Python.structure.Sequence import Sequence
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


class FastaParser(SequenceListParser):
    """A class that reads a list of sequences in FASTA"""

    def __init__(self, fileName, verbosity = 0):
        super(FastaParser, self).__init__(fileName, verbosity)
        # sequence name (first word of the header) -> file offset of the header
        self.tags = {}


    def getTags(self):
        return self.tags


    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["fasta", "mfa", "fas"]


    def getInfos(self):
        """
        Get some generic information about the sequences
        (number of sequences and total number of letters)
        """
        self.nbSequences = 0
        self.size = 0
        self.reset()
        progress = UnlimitedProgress(100000, "Reading input file", self.verbosity - 9)
        for line in self.handle:
            line = line.strip()
            if line == "":
                continue
            if line[0] == ">":
                self.nbSequences += 1
            else:
                self.size += len(line)
            progress.inc()
        progress.done()
        self.reset()


    def _headerToName(self, header):
        """Build a sequence name from a header line: first word, with '|' and '.' replaced by '_'."""
        return header[1:].split()[0].replace("|", "_").replace(".", "_")


    def parseOne(self):
        """
        Parse only one element in the file
        @return: the next Sequence, or None when the file is exhausted
        @raise Exception: if the header kept from the previous call does
                          not start with '>'
        """
        name = None
        chunks = []

        # header already read while finishing the previous sequence
        if self.currentLine != None:
            if self.currentLine[0] != ">":
                raise Exception("First line is weird: %s" % (self.currentLine))
            name = self._headerToName(self.currentLine)
            self.currentLine = None

        for line in self.handle:
            line = line.strip()
            if line == "":
                continue
            if line[0] == ">":
                if name == None:
                    name = self._headerToName(line)
                else:
                    # start of the next sequence: keep it for the next call
                    self.currentLine = line
                    return Sequence(name, "".join(chunks))
            else:
                chunks.append(line)

        if name == None:
            return None
        return Sequence(name, "".join(chunks))


    def setTags(self):
        """Record the file offset of every header line, then restore the file position."""
        mark = self.handle.tell()
        thisTag = mark

        line = self.handle.readline()
        while line != "":
            if line[0] == ">":
                line = line.strip()
                self.tags[line[1:].split()[0]] = thisTag
            thisTag = self.handle.tell()
            line = self.handle.readline()

        self.handle.seek(mark)


    def getSubSequence(self, chromosome, start, end, direction, name = None):
        """
        Extract a part of a sequence, using the header index to jump into the file.

        The jump assumes that all the lines of the sequence located before
        the requested region have the same length as its first line.

        @param chromosome: name of the sequence (first word of its header)
        @param start:      first position to extract (first letter is 1)
        @param end:        last position to extract, included
        @param direction:  -1 to get the reverse-complement
        @param name:       name of the returned sequence (built from the
                           other parameters if not given)
        @return:           a Sequence
        @raise Exception:  if the sequence is unknown, if the index points
                           to a wrong place, or if nothing was extracted
        """
        if not self.tags:
            self.setTags()

        if chromosome not in self.tags:
            raise Exception("Cannot find " + chromosome)

        if name == None:
            name = "%s:%d-%d (%d)" % (chromosome, start, end, direction)
        sequence = Sequence(name)

        # switch from 1-based coordinates to 0-based string offsets
        start -= 1
        end -= 1

        self.handle.seek(self.tags[chromosome])
        line = self.handle.readline().strip()
        # the index is keyed by the first word of the header only, so a
        # header may carry a description after the name
        headerWords = line[1:].split()
        if not line.startswith(">") or not headerWords or headerWords[0] != chromosome:
            raise Exception("Arrived in a wrong place (got %s)" % (line))

        position1 = self.handle.tell()
        line = self.handle.readline().strip()
        position2 = self.handle.tell()
        size = len(line)
        if size == 0:
            raise Exception("Error, sequence %s is empty" % (name))

        # jump to the beginning of the line which contains 'start'
        lineStart = start - (start % size)
        address = position1 + (lineStart // size) * (position2 - position1)
        count = max(0, lineStart)
        self.handle.seek(address)

        chunks = []
        for line in self.handle:
            line = line.strip()
            if line == "":
                continue
            if line[0] == ">":
                break

            # offsets of the requested region inside the current line
            subStart = max(0, start - count)
            subEnd = end - count
            if subEnd < 0:
                break
            subSize = min(subEnd - subStart + 1, len(line) - subStart)
            if subStart <= len(line):
                chunks.append(line[subStart:subStart+subSize])
            count += len(line)

        newSequence = "".join(chunks)
        if newSequence == "":
            raise Exception("Error, sequence %s is empty" % (name))
        sequence.sequence = newSequence
        if direction == -1:
            sequence.reverseComplement()
        return sequence
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/FastqParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/FastqParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,104 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from commons.core.parsing.SequenceListParser import SequenceListParser
from SMART.Java.Python.structure.Sequence import Sequence


class FastqParser(SequenceListParser):
    """A class that reads a list of sequences in FASTQ format"""

    def __init__(self, fileName, verbosity = 0):
        super(FastqParser, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["fastq", "mfq"]


    def getInfos(self):
        """
        Get some generic information about the sequences
        (the number of sequences is the number of non-empty lines divided by 4)
        """
        self.nbSequences = 0
        self.reset()
        if self.verbosity >= 10:
            print("Getting information on %s." % (self.fileName))

        nbLines = 0
        for line in self.handle:
            line = line.strip()
            if line == "":
                continue
            nbLines += 1
            if self.verbosity >= 10 and nbLines % 400000 == 0:
                sys.stdout.write(" %d sequences read\r" % (nbLines // 4))
                sys.stdout.flush()
        self.reset()
        # floor division keeps the count an integer
        self.nbSequences = nbLines // 4
        if self.verbosity >= 10:
            print(" %d sequences read" % (self.nbSequences))
            print("Done.")


    def parseOne(self):
        """
        Parse only one element in the file
        @return: the next Sequence (with its quality), or None when no
                 complete record is left
        @raise Exception: if a record does not start with '@'
        Exits the program (sys.exit) if the third line of a record does not
        start with '+' or repeats a different name.
        """
        name = None
        string = ""
        quality = ""
        lineType = 0

        for line in self.handle:
            line = line.strip()
            if lineType == 0:
                # blank lines between records (or at the end of the file)
                # are not part of any record
                if line == "":
                    continue
                if line[0] != "@":
                    raise Exception("Line '%s' should start with '@'!" % (line))
                name = line[1:]
            elif lineType == 1:
                string = line
            elif lineType == 2:
                if not line.startswith("+"):
                    sys.exit("Line '%s' should start with '+'!" % (line))
                # the name may be repeated after the '+', or omitted
                if line[1:] != name and line != "+":
                    sys.exit("Weird difference in sequence and quality names (%s and %s) while parsing FASTQ file %s." % (name, line[1:], self.fileName))
            elif lineType == 3:
                quality = line
            lineType += 1
            if lineType == 4:
                sequence = Sequence(name, string)
                sequence.setQuality(quality)
                return sequence

        return None
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/FindRep.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/FindRep.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
import re
from xml.sax.handler import ContentHandler


class FindRep( ContentHandler ):
    """
    SAX content handler that converts an mreps XML report into a
    tab-separated file, one line per <repeat> element:

        count, (most frequent unit)copy-number, sequence name, start, end

    Repeats whose <score> is greater than `filter` are not written.
    """

    def __init__(self, outfileName, filter=0, count=0):
        """
        outfileName: path of the file opened for writing in startDocument
        filter:      maximum score a repeat may have to be written
        count:       initial value of the running repeat counter
        """
        self.inWindowContent = 0
        self.inSeqNameContent = 0
        self.inRepContent = 0      # was only created on the first <repeat>
        self.inStartContent = 0
        self.inEndContent = 0
        self.inPeriodContent = 0
        self.inUnitContent = 0
        self.inScoreContent = 0
        self.count = count
        self._outfileName = outfileName
        self.filter = filter

    def startDocument(self):
        """Open the output file."""
        self._fileout = open(self._outfileName, "w")

    def startElement(self, name, attrs):
        """Raise the flag of the element being entered and reset its text buffer."""
        if name == "window":
            self.inWindowContent = 1
        elif name == "sequence-name":
            self.inSeqNameContent = 1
            self.seqname = ""
        elif name == "repeat":
            self.inRepContent = 1
            self.start = ""
            self.end = ""
            self.period = ""
            self.type = {}         # unit string -> number of occurrences
        elif name == "start":
            self.inStartContent = 1
        elif name == "end":
            self.inEndContent = 1
        elif name == "period":
            self.inPeriodContent = 1
        elif name == "unit":
            self.inUnitContent = 1
            self.unit = ""
        elif name == "score":
            self.inScoreContent = 1
            self.score = ""

    def characters(self, ch):
        """Append text to the buffer of the element currently open."""
        if self.inSeqNameContent:
            self.seqname += ch
        elif self.inStartContent:
            self.start += ch
        elif self.inEndContent:
            self.end += ch
        elif self.inPeriodContent:
            self.period += ch
        elif self.inUnitContent:
            self.unit += ch
        elif self.inScoreContent:
            self.score += ch

    def endElement(self, name):
        """Lower the flag of the element being left; write the line on </repeat>."""
        if name == "window":
            self.inWindowContent = 0
        elif name == "sequence-name":
            self.inSeqNameContent = 0
        elif name == "repeat":
            self.inRepContent = 0
            self._writeRepeat()
        elif name == "start":
            self.inStartContent = 0
        elif name == "end":
            self.inEndContent = 0
        elif name == "period":
            self.inPeriodContent = 0
        elif name == "score":
            self.inScoreContent = 0
        elif name == "unit":
            self.inUnitContent = 0
            # dict.get replaces the Python-2-only has_key()
            self.type[self.unit] = self.type.get(self.unit, 0) + 1

    def _writeRepeat(self):
        """Write the output line of the repeat that has just been closed."""
        start = int(self.start)
        end = int(self.end)
        period = int(self.period)
        score = float(self.score)
        if score > self.filter:
            return
        self.count += 1

        # Most frequent unit; the first one met wins ties.  Defaults to ""
        # so that a repeat without any <unit> no longer raises NameError.
        bestUnit = ""
        bestCount = 0
        for unit, nbOccurrences in self.type.items():
            if nbOccurrences > bestCount:
                bestCount = nbOccurrences
                bestUnit = unit

        # Names of the form "<number> <name> {Cut} <offset>..<end>" carry an
        # offset that is added to the coordinates.
        m = re.match(r"^[0-9]+.+\{Cut\}", self.seqname)
        if m is not None:
            seqname = self.seqname[m.start(0):m.end(0)-5].rstrip()
            seqname = re.sub("^[0-9]+ ", "", seqname).lstrip()
            tok = self.seqname[m.end(0):].split("..")
            astart = start + int(tok[0]) - 1
            aend = end + int(tok[0]) - 1
        else:
            astart = start
            aend = end
            seqname = self.seqname

        # shorten very long units, keeping both extremities
        if len(bestUnit) > 100:
            bestUnit = bestUnit[:48] + "..." + bestUnit[-51:]
        # floor division keeps the Python 2 integer copy number
        strout = "%d\t(%s)%d\t%s\t%d\t%d" % \
                 (self.count, bestUnit, (abs(start - end) + 1) // period,
                  seqname, astart, aend)
        self._fileout.write("%s\n" % (strout))

    def endDocument(self):
        """Close the output file."""
        self._fileout.close()
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/GbParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/GbParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,111 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.structure.Transcript import Transcript
from commons.core.parsing.TranscriptListParser import TranscriptListParser


class GbParser(TranscriptListParser):
    """A class that parses a GBrowse file and create a transcript list"""

    def __init__(self, fileName, verbosity = 0):
        # set before the parent constructor runs
        self.reference = None
        self.color = None
        super(GbParser, self).__init__(fileName, verbosity)

    def __del__(self):
        super(GbParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["gb", "gbrowse"]

    def skipFirstLines(self):
        """
        Consume the header, up to and including the first blank line,
        remembering the value of a 'bgcolor = ...' line in self.color.
        """
        for rawLine in self.handle:
            self.currentLineNb += 1
            stripped = rawLine.strip()
            colorMatch = re.search(r"^\s*bgcolor\s*=\s*(\S+)\s*$", stripped)
            if colorMatch is not None:
                self.color = colorMatch.group(1)
            if not stripped:
                return

    def parseLine(self, line):
        """
        Build a Transcript from a 'READS name start-end,start-end "comment"'
        line.  A 'reference = ...' line updates self.reference and makes the
        parser read the following line of the handle in its place.

        Calls sys.exit on a malformed line or exon, or when no reference
        has been met yet.
        """
        transcript = Transcript()

        # first line (reference)
        referenceMatch = re.search(r"^\s*reference\s*=\s*(\S+)\s*$", line)
        if referenceMatch is not None:
            self.reference = referenceMatch.group(1)
            # take only the next line of the handle
            for line in self.handle:
                line = line.strip()
                self.currentLineNb += 1
                break

        # second line (genomic coordinates)
        readsMatch = re.search(r"^\s*READS\s+(\S+)\s+(\S+)\s+\"([^\"]*)\"\s*$", line)
        if readsMatch is None:
            sys.exit("\nLine %d '%s' does not have a GBrowse format" % (self.currentLineNb, line))
        if self.reference == None:
            sys.exit("Cannot get reference of GBrowse line %d '%s'" % (self.currentLineNb, line))
        transcript.setChromosome(self.reference)
        transcript.setName(readsMatch.group(1))
        transcript.setComment(readsMatch.group(3))

        # exons
        transcriptStart = 1000000000
        transcriptEnd = 0
        direction = 0
        for exonText in readsMatch.group(2).split(","):
            exonMatch = re.search(r"^(\d+)-(\d+)$", exonText)
            if exonMatch is None:
                sys.exit("\nCannot read GBrowse exon line %d '%s'" % (self.currentLineNb, exonText))
            first = int(exonMatch.group(1))
            second = int(exonMatch.group(2))
            interval = Interval()
            interval.setChromosome(transcript.chromosome)
            # the written order of the bounds accumulates into the direction
            direction += second - first
            start = min(first, second)
            end = max(first, second)
            interval.setStart(start)
            interval.setEnd(end)
            transcriptStart = min(transcriptStart, start)
            transcriptEnd = max(transcriptEnd, end)
            transcript.addExon(interval)

        transcript.setStart(transcriptStart)
        transcript.setEnd(transcriptEnd)
        transcript.setDirection(direction)
        for exon in transcript.getExons():
            exon.setDirection(direction)
        return transcript
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/GffParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/GffParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,149 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.structure.Transcript import Transcript
from commons.core.parsing.TranscriptListParser import TranscriptListParser


class GffParser(TranscriptListParser):
    """A class that parses a GFF file and create a transcript list"""

    # The strand class is written [+\-.]: the original [+-.] is a character
    # RANGE from '+' to '.', which also accepted ','.
    _LINE_RE = re.compile(r"^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+\-.])\s+(\S+)\s+(\S.*)$")

    # attribute names whose value is used as the name of the interval
    _NAME_FIELDS = ("Name", "name", "Sequence", "TE", "SAT")


    def __init__(self, fileName, verbosity = 0):
        super(GffParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(GffParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["gff", "gff2", "gff3"]


    def skipFirstLines(self):
        """GFF files have no header to skip."""
        pass


    def getInfos(self):
        """
        Scan the whole file and fill self.chromosomes (set of column-1
        values), self.nbTranscripts (lines without 'Parent' in column 9)
        and self.size (summed length of the lines with 'Parent').

        Raises Exception on a data line that does not have 9 tab-separated
        fields.  The handle is reset before and after the scan.
        """
        self.chromosomes = set()
        self.nbTranscripts = 0
        self.size = 0
        self.reset()
        if self.verbosity >= 10:
            print("Getting information on %s." % (self.fileName))
        for line in self.handle:
            line = line.strip()
            if line == "" or line[0] == "#":
                continue
            parts = line.split("\t")
            if len(parts) != 9:
                raise Exception("Error! Line '%s' has %d tab-separated fields instead of 9!" % (line, len(parts)))
            self.chromosomes.add(parts[0])
            if parts[8].find("Parent") == -1:
                self.nbTranscripts += 1
            else:
                self.size += max(int(parts[3]), int(parts[4])) - min(int(parts[3]), int(parts[4])) + 1
            if self.verbosity >= 10 and self.nbTranscripts % 100000 == 0:
                sys.stdout.write(" %d transcripts read\r" % (self.nbTranscripts))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            print(" %d transcripts read" % (self.nbTranscripts))
            print("Done.")


    def _parseAttributes(self, interval, attributes):
        """Store the ';'-separated 'key=value' or 'key value' pairs of column 9 in interval."""
        for remaining in attributes.split(";"):
            remaining = remaining.strip()
            if remaining == "":
                continue
            posSpace = remaining.find(" ")
            posEqual = remaining.find("=")
            if posEqual != -1 and (posEqual < posSpace or posSpace == -1):
                # split on the first '=' only, so that a value containing
                # '=' is kept intact (it used to be re-joined with spaces)
                parts = remaining.split("=", 1)
            else:
                parts = remaining.split()
            field = parts[0].strip()
            value = " ".join(parts[1:]).strip(" \"")
            if field in self._NAME_FIELDS:
                interval.setName(value)
            else:
                try:
                    interval.setTagValue(field, int(value))
                except ValueError:
                    interval.setTagValue(field, value)


    def parseLine(self, line):
        """
        Parse one GFF line.

        A line carrying a 'Parent' attribute is added as an exon to the
        transcript being built and None is returned.  Any other line starts
        a new transcript and the previously built one (None for the first
        line) is returned.  Empty and comment lines return None.

        Raises Exception when the line is not GFF, when the file starts
        with an exon, or when an exon does not follow its own transcript.
        """
        if not line or line[0] == "#":
            return None
        m = self._LINE_RE.search(line)
        if m is None:
            raise Exception("\nLine %d '%s' does not have a GFF format\n" % (self.currentLineNb, line))
        interval = Interval()
        interval.setChromosome(m.group(1))
        interval.setName("unnamed transcript")
        interval.setStart(min(int(m.group(4)), int(m.group(5))))
        interval.setEnd(max(int(m.group(4)), int(m.group(5))))
        # an unknown strand ('.') is stored as '+'
        if m.group(7) == ".":
            interval.setDirection("+")
        else:
            interval.setDirection(m.group(7))
        interval.setTagValue("feature", m.group(3))
        # only scores made of digits are stored
        if m.group(6).isdigit():
            interval.setTagValue("score", m.group(6))

        self._parseAttributes(interval, m.group(9))

        self.currentTranscriptAddress = self.previousTranscriptAddress
        if "Parent" in interval.getTagNames():
            if self.currentTranscript == None:
                raise Exception("GFF file does not start with a transcript! First line is '%s'." % (line.strip()))
            if interval.getTagValue("Parent") != self.currentTranscript.getTagValue("ID"):
                raise Exception("Exon '%s' is not right after its transcript in GFF file!" % (interval))
            self.currentTranscript.addExon(interval)
            if interval.name == None:
                interval.name = self.currentTranscript.name
            return None

        # new transcript: hand back the one that has just been completed
        transcript = self.currentTranscript
        self.currentTranscript = Transcript()
        self.currentTranscript.copy(interval)
        self.previousTranscriptAddress = self.currentAddress

        if transcript != None and transcript.name.startswith("unnamed"):
            if "ID" in transcript.getTagNames():
                transcript.name = transcript.getTagValue("ID")
            else:
                transcript.name = "unnamed transcript %s" % (self.currentLineNb)
        return transcript
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/GtfParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/GtfParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,113 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.structure.Transcript import Transcript
from commons.core.parsing.TranscriptListParser import TranscriptListParser


class GtfParser(TranscriptListParser):
    """A class that parses a GTF file and create a transcript list"""

    # The strand class is written [+\-.]: the original [+-.] is a character
    # RANGE from '+' to '.', which also accepted ','.
    _LINE_RE = re.compile(r"^\s*(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+([+\-.])\s+(\S+)\s+(\S.*)$")


    def __init__(self, fileName, verbosity = 0):
        super(GtfParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(GtfParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["gtf", "gtf2"]


    def skipFirstLines(self):
        """GTF files have no header to skip."""
        pass


    def parseLine(self, line):
        """
        Parse one GTF line.

        Only 'transcript' and 'exon' features are used.  When the
        transcript_id of the line differs from the one of the transcript
        being built, a new transcript is started and the previous one
        (None for the first line) is returned; otherwise an 'exon' line is
        added to the current transcript and None is returned.  Empty lines,
        comment lines and other features return None.

        Raises Exception when the line does not have a GTF format.
        """
        # the empty-line guard avoids an IndexError on line[0]
        if not line or line[0] == "#":
            return None
        m = self._LINE_RE.search(line)
        if m is None:
            raise Exception("\nLine %d '%s' does not have a GTF format\n" % (self.currentLineNb, line))
        interval = Interval()
        interval.setChromosome(m.group(1))
        interval.setName("unnamed transcript")
        interval.setStart(min(int(m.group(4)), int(m.group(5))))
        interval.setEnd(max(int(m.group(4)), int(m.group(5))))
        # an unknown strand ('.') is stored as '+'
        if m.group(7) == ".":
            interval.setDirection("+")
        else:
            interval.setDirection(m.group(7))
        # only scores made of digits are stored
        if m.group(6).isdigit():
            interval.setTagValue("score", m.group(6))
        feature = m.group(3)

        if feature not in ("transcript", "exon"):
            return None

        for remaining in m.group(9).split(";"):
            remaining = remaining.strip()
            if remaining == "":
                continue
            parts = remaining.split(" ", 1)
            field = parts[0].strip()
            value = " ".join(parts[1:]).strip(" \"")
            if field == "transcript_id":
                interval.setTagValue("ID", value)
            elif field in ("gene_name", "transcript_name"):
                interval.setName(value)
            elif field == "exon_number":
                continue
            else:
                try:
                    interval.setTagValue(field, int(value))
                except ValueError:
                    interval.setTagValue(field, value)

        self.currentTranscriptAddress = self.previousTranscriptAddress
        if self.currentTranscript == None or interval.getTagValue("ID") != self.currentTranscript.getTagValue("ID"):
            # new transcript: hand back the one that has just been completed
            transcript = self.currentTranscript
            self.currentTranscript = Transcript()
            self.currentTranscript.copy(interval)
            self.currentTranscript.setTagValue("feature", "transcript")
            self.previousTranscriptAddress = self.currentAddress
            return transcript
        if feature == "exon":
            self.currentTranscript.addExon(interval)
        return None
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/MapParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MapParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,67 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.SubMapping import SubMapping
from SMART.Java.Python.misc import Utils
from SMART.Java.Python.structure.Transcript import Transcript
from commons.core.parsing.TranscriptListParser import TranscriptListParser


class MapParser(TranscriptListParser):
    """A class that parses the repet .map files"""

    def __init__(self, fileName, verbosity = 0):
        # compiled before the parent constructor runs
        self._lineParseRe = re.compile(r"(?P<seqName>\w+)\s(?P<chrName>\w+)\s(?P<sStart>\d+)\s(?P<sEnd>\d+)")
        TranscriptListParser.__init__(self, fileName, verbosity)

    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["map"]

    def skipFirstLines(self):
        """Nothing is skipped at the start of the file."""
        return

    def parseLine(self, line):
        """
        Build a forward-strand Transcript from a 'name chromosome start end'
        line; the two bounds may be given in any order.

        Calls sys.exit when the line does not match.
        """
        found = self._lineParseRe.search(line)
        if found is None:
            sys.exit("\nLine %d '%s' does not have a map format" % (self.currentLineNb, line))

        firstBound = int(found.group("sStart"))
        secondBound = int(found.group("sEnd"))

        transcript = Transcript()
        transcript.setChromosome(found.group("chrName"))
        transcript.setStart(min(firstBound, secondBound))
        transcript.setEnd(max(firstBound, secondBound))
        transcript.setName(found.group("seqName"))
        transcript.setDirection(1)
        return transcript
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/MapperParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MapperParser.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,129 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from SMART.Java.Python.structure.Mapping import Mapping


class MapperParser(object):
    """An interface that parses the output of a generic mapper"""

    def __init__(self, fileName, verbosity = 0):
        """Open fileName, let the subclass skip its header and remember where the data start."""
        super(MapperParser, self).__init__()
        self.verbosity = verbosity
        self.nbMappings = None
        self.chromosomes = None
        self.size = None
        self.currentMapping = Mapping()
        # set before skipFirstLines() so that subclasses may use it there
        self.fileName = fileName
        self.handle = open(fileName)
        self.currentLineNb = 0
        self.skipFirstLines()
        self.startingPoint = self.handle.tell()


    def __del__(self):
        # the attribute is missing when open() failed in __init__
        handle = getattr(self, "handle", None)
        if handle is not None:
            handle.close()


    def reset(self):
        """Go back to the first data line."""
        self.handle.seek(self.startingPoint)
        self.currentLineNb = 0


    def getNextMapping(self):
        """
        Return the next mapping produced by parseLine, skipping the lines
        for which it returns None; return False at the end of the file.
        """
        for line in self.handle:
            # incremented first so that parseLine reports 1-based line
            # numbers in its error messages (the first line used to be 0)
            self.currentLineNb += 1
            mapping = self.parseLine(line)
            if mapping != None:
                return mapping
        return False


    def getIterator(self):
        """Yield every mapping of the file, starting from the first data line."""
        self.reset()
        mapping = self.getNextMapping()
        while mapping:
            yield mapping
            mapping = self.getNextMapping()


    def getInfos(self):
        """
        Read the whole file and fill self.chromosomes, self.nbMappings and
        self.size (sum of the sizes of the mapped transcripts).
        """
        self.chromosomes = set()
        self.nbMappings = 0
        self.size = 0
        self.reset()
        if self.verbosity >= 10:
            print("Getting information.")
        for mapping in self.getIterator():
            transcript = mapping.getTranscript()
            self.chromosomes.add(transcript.getChromosome())
            self.nbMappings += 1
            self.size += transcript.getSize()
            if self.verbosity >= 10 and self.nbMappings % 100000 == 0:
                sys.stdout.write(" %d mappings read\r" % (self.nbMappings))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            print(" %d mappings read" % (self.nbMappings))
            print("Done.")


    def getNbMappings(self):
        """Return the number of mappings, scanning the file on first use."""
        if self.nbMappings != None:
            return self.nbMappings
        self.getInfos()
        return self.nbMappings


    def getNbItems(self):
        return self.getNbMappings()


    def getChromosomes(self):
        """Return the set of chromosomes, scanning the file on first use."""
        if self.chromosomes != None:
            return self.chromosomes
        self.getInfos()
        return self.chromosomes


    def getSize(self):
        """Return the summed transcript size, scanning the file on first use."""
        if self.size != None:
            return self.size
        self.getInfos()
        return self.size


    def getNbNucleotides(self):
        return self.getSize()


    def setDefaultTagValue(self, name, value):
        # NOTE(review): the mappings are created by the iterator and not kept
        # anywhere in this class, so the tag looks lost as soon as the loop
        # ends -- confirm whether callers expect it to persist.
        for mapping in self.getIterator():
            mapping.setTagValue(name, value)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/MaqParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MaqParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,77 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from commons.core.parsing.MapperParser import MapperParser


class MaqParser(MapperParser):
    """A class that parses the output of Maq"""

    def __init__(self, fileName, verbosity = 0):
        super(MaqParser, self).__init__(fileName, verbosity)

    def __del__(self):
        super(MaqParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """Return the format names handled by this parser."""
        return ["maq"]

    def skipFirstLines(self):
        """Maq files have no header to skip."""
        pass

    def parseLine(self, line):
        """
        Build a Mapping from one 16-column Maq line.

        Columns used: 1 read name, 2 target name, 3 target start, 4 strand,
        10 number of mismatches, 12 + 13 number of occurrences, 14 size.
        Calls sys.exit when the line does not match.
        """
        fields = re.search(r"^\s*(\S+)\s+(\S+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s*$", line)
        if fields is None:
            sys.exit("\nLine %d '%s' does not have a MAQ format" % (self.currentLineNb, line))

        readSize = int(fields.group(14))
        mapping = Mapping()

        target = mapping.targetInterval
        target.setStart(int(fields.group(3)))
        target.setSize(readSize)
        target.setChromosome(fields.group(2))

        # the whole read is aligned, from its first base
        query = mapping.queryInterval
        query.setStart(1)
        query.setSize(readSize)
        query.setName(fields.group(1))

        mapping.setDirection(fields.group(4))
        mapping.setSize(readSize)
        mapping.setNbMismatches(int(fields.group(10)))
        mapping.setRank(1)
        mapping.setNbOccurrences(int(fields.group(12)) + int(fields.group(13)))
        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/MrepsToSet.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MrepsToSet.py Tue Apr 30 15:02:29 2013 -0400 |
b |
from commons.core.parsing.FindRep import FindRep
from xml.sax import make_parser
from xml.sax.handler import feature_namespaces
import os


class MrepsToSet(object):
    """Convert the XML output of Mreps into a 'set' file through the FindRep SAX handler."""

    def __init__(self, mrepsInputFileName="", mrepsOuputFileName="", outputFileName=None, errorFilter=0):
        self._mrepsInputFileName = mrepsInputFileName
        self._mrepsOuputFileName = mrepsOuputFileName
        # a missing (or empty) output name is derived from the Mreps output name
        if not outputFileName:
            outputFileName = "%s.Mreps.set" % mrepsOuputFileName
        self._outputFileName = outputFileName
        self._errorFilter = errorFilter

    def run(self):
        """Parse the Mreps XML file; the handler writes the 'set' file."""
        handler = FindRep( self._outputFileName, self._errorFilter, 0 )
        saxParser = make_parser()
        saxParser.setFeature( feature_namespaces, 0 )
        saxParser.setContentHandler( handler )
        saxParser.parse( self._mrepsOuputFileName )

    def clean( self ):
        """
        Remove the output file (xml) from Mreps to keep only the 'set' file.
        """
        xmlFileName = self._mrepsOuputFileName
        if os.path.exists(xmlFileName):
            os.remove(xmlFileName)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/Multifasta2SNPFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/Multifasta2SNPFile.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,846 @@\n+import re\n+import os\n+import logging\n+from commons.core.utils.FileUtils import FileUtils\n+from commons.core.seq.BioseqDB import BioseqDB\n+from commons.core.seq.Bioseq import Bioseq\n+from commons.core.LoggerFactory import LoggerFactory\n+\n+DNA_ALPHABET_WITH_N_AND_DELS = set ([\'A\',\'T\',\'G\',\'C\',\'N\',\'-\'])\n+IUPAC = set([\'A\',\'T\',\'G\',\'C\',\'U\',\'R\',\'Y\',\'M\',\'K\',\'W\',\'S\',\'B\',\'D\',\'H\',\'V\',\'N\', \'-\', \'a\',\'t\',\'g\',\'c\',\'u\',\'r\',\'y\',\'m\',\'k\',\'w\',\'s\',\'b\',\'d\',\'h\',\'v\',\'n\'])\n+\n+class Multifasta2SNPFile( object ):\n+\n+ POLYM_TYPE_4_SNP = "SNP"\n+ POLYM_TYPE_4_INSERTION = "INSERTION"\n+ POLYM_TYPE_4_DELETION = "DELETION"\n+ POLYM_DEFAULT_CONFIDENCE_VALUE = "A"\n+ SNP_LENGTH = 1\n+ FLANK_LENGTH = 250\n+ \n+ def __init__(self, taxon, batchName="", geneName=""):\n+ \n+ if(batchName):\n+ self._batchName = batchName\n+ \n+ if(geneName):\n+ self._geneName = geneName\n+\n+ self._taxon = taxon\n+ self._outSubSNPFileName = "SubSNP.csv"\n+ self._outAlleleFileName = "Allele.csv"\n+ self._outIndividualFileName = "Individual.csv"\n+ self._outSequenceFSAFileName = "Sequences.fsa"\n+ self._outSequenceCSVFileName = "Sequences.csv"\n+ self._outBatchFileName = "Batch.txt"\n+ self._outBatchLineFileName = "BatchLine.csv"\n+ self._logFileName = "multifasta2SNP.log"\n+ \n+ self._lBatchFileResults = []\n+ self._lSubSNPFileResults = []\n+ self._lRefSequences = []\n+ self._lIndividualFileResults = []\n+ self._lBatchLineFileResults = []\n+ self._dIndividualNumbers4SubSNPResults = {}\n+ self._dAlleleFileResults = {}\n+ \n+ \n+ self.dcurrentIndel = {}\n+ self.lIndelsOfTheCurrentLine = []\n+ self.lIndelsOverAllLines = []\n+ self.dSNPsPositions = {}\n+ \n+ self._iCurrentLineNumber = 0\n+ self._currentBatchNumber = 1\n+ self.currentLineName = ""\n+ self.currentNucleotide = ""\n+ self.currentPosition = 0\n+ self._sPolymConfidenceValue = Multifasta2SNPFile.POLYM_DEFAULT_CONFIDENCE_VALUE \n+ self._sPolymType = 
Multifasta2SNPFile.POLYM_TYPE_4_SNP\n+ self._iPolymLength = Multifasta2SNPFile.SNP_LENGTH\n+ self._fileUtils = FileUtils()\n+ \n+ if self._fileUtils.isRessourceExists(self._logFileName):\n+ os.remove(self._logFileName)\n+ self._logFile = LoggerFactory.createLogger(self._logFileName, logging.INFO, "%(asctime)s %(levelname)s: %(message)s")\n+ \n+ def runOneBatch( self, inFileName):\n+ self._currentFileName = inFileName\n+ #TODO: methode a virer; n\'utiliser au final que runOneBatchWithoutWriting\n+ self._wrapper = self.createWrapperFromFile(inFileName)\n+ self._lBatchFileResults = self.completeBatchList()\n+ self.detectSNPsAndIndels(self._wrapper) \n+ self._writeAllOutputFiles()\n+ self._currentBatchNumber += 1\n+ \n+ def runOneBatchWithoutWriting( self, inFileName):\n+ self.lIndelsOverAllLines = []\n+ self._currentFileName = inFileName\n+ self._wrapper = self.createWrapperFromFile(inFileName)\n+ self._lBatchFileResults = self.completeBatchList()\n+ self.detectSNPsAndIndels(self._wrapper) \n+ self._currentBatchNumber += 1\n+ \n+\n+ def _cleanOutputsInTheCurrentDir(self):\n+ #TODO: create a list of files to be deleted\n+ FileUtils.removeFilesByPattern("*.csv")\n+ if (FileUtils.isRessourceExists(self._outBatchFileName)):\n+ os.remove(self._outBatchFileName)\n+ if (FileUtils.isRessourceExists(self._outSequenceFSAFileName)):\n+ os.remove(self._outSequenceFSAFileName)\n+\n+\n+ def _createOutputObjectsIteratingOnCurrentDir(self):\n+ #TODO: gerer les extensions multiples\n+ extList = [".fasta", ".fsa"]\n+ for dirname, dirnames, filenames in os.walk("."):\n+ filenames.sort()\n+ for filename in filenames:\n+ '..b'elf, batchLineFileName, lBatchLineResults):\n+ outF = open(batchLineFileName, "w")\n+ self._writeBatchLineFileHeader(outF)\n+ for dResult in lBatchLineResults:\n+ self._writeBatchLineFileLine(outF, dResult)\n+ outF.close()\n+ \n+ def _writeSNPFileHeader(self, outF):\n+ for head in Multifasta2SNPFileWriter.SUB_SNP_FILE_HEADER[:-1]:\n+ outF.write(head + 
self._csvFieldSeparator)\n+ outF.write(Multifasta2SNPFileWriter.SUB_SNP_FILE_HEADER[-1] + self._csvLineSeparator)\n+ \n+ def _writeAlleleFileHeader(self, outF):\n+ for head in Multifasta2SNPFileWriter.ALLELE_FILE_HEADER[:-1]:\n+ outF.write(head + self._csvFieldSeparator)\n+ outF.write(Multifasta2SNPFileWriter.ALLELE_FILE_HEADER[-1] + self._csvLineSeparator)\n+ \n+ def _writeIndividualFileHeader(self, outF):\n+ for head in Multifasta2SNPFileWriter.INDIVIDUAL_FILE_HEADER[:-1]:\n+ outF.write(head + self._csvFieldSeparator)\n+ outF.write(Multifasta2SNPFileWriter.INDIVIDUAL_FILE_HEADER[-1] + self._csvLineSeparator)\n+ \n+ def _writeSequenceCSVHeader(self, outF):\n+ for head in Multifasta2SNPFileWriter.SEQUENCE_CSV_FILE_HEADER[:-1]:\n+ outF.write(head + self._csvFieldSeparator)\n+ outF.write(Multifasta2SNPFileWriter.SEQUENCE_CSV_FILE_HEADER[-1] + self._csvLineSeparator)\n+ \n+ def _writeBatchLineFileHeader(self, outF):\n+ for head in Multifasta2SNPFileWriter.BATCH_LINE_FILE_HEADER[:-1]:\n+ outF.write(head + self._csvFieldSeparator)\n+ outF.write(Multifasta2SNPFileWriter.BATCH_LINE_FILE_HEADER[-1] + self._csvLineSeparator) \n+ \n+ def _writeSNPFileLine(self, outF, dSNP):\n+ outF.write(dSNP[\'subSNPName\'] + self._csvFieldSeparator)\n+ outF.write(dSNP[\'confidenceValue\'] + self._csvFieldSeparator + dSNP[\'type\'] + self._csvFieldSeparator)\n+ outF.write(str(dSNP[\'position\']) + self._csvFieldSeparator + dSNP[\'5flank\'] + self._csvFieldSeparator + dSNP[\'3flank\'] + self._csvFieldSeparator)\n+ outF.write(str(dSNP[\'length\']) + self._csvFieldSeparator + str(dSNP[\'batchNumber\']) + self._csvFieldSeparator)\n+ outF.write(str(dSNP[\'lineName\']) + self._csvFieldSeparator)\n+ outF.write(self._primerType + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + str(dSNP[\'allele\']) + self._csvLineSeparator)\n+\n+ def _writeAlleleFileLine(self, outF, sAllele2Write, iAlleleNumber):\n+ outF.write(str(iAlleleNumber) + self._csvFieldSeparator)\n+ 
outF.write(sAllele2Write + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvLineSeparator)\n+ \n+ def _writeIndividualFileLine(self, outF, dIndividual):\n+ outF.write(str(dIndividual[\'individualNumber\']) + self._csvFieldSeparator)\n+ outF.write(dIndividual[\'individualName\'] + self._csvFieldSeparator + self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator+ self._csvFieldSeparator)\n+ outF.write(dIndividual[\'scientificName\'] + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator+ self._csvFieldSeparator + self._csvFieldSeparator + self._csvLineSeparator)\n+ \n+ def _writeSequenceCSVLine(self, outF, refSeq, taxon):\n+ outF.write(refSeq.header + self._csvFieldSeparator)\n+ outF.write("Reference" + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator)\n+ outF.write(taxon + self._csvLineSeparator) \n+ \n+ def _writeBatchLineFileLine(self, outF, dResult):\n+ outF.write(str(dResult[\'IndividualNumber\']) + self._csvFieldSeparator + self._csvFieldSeparator + self._csvFieldSeparator)\n+ outF.write(str(dResult[\'BatchNumber\']) + self._csvFieldSeparator + self._csvLineSeparator)\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/MummerParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/MummerParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,93 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.SubMapping import SubMapping


class MummerParser(MapperParser):
    """
    Parser for Mummer output.
    Each call to parseLine consumes one '>' header line plus the match line
    that follows it, and returns them as a single Mapping.
    """

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName:  name of the Mummer file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(MummerParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(MummerParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """
        @return: the list of format names handled by this parser
        """
        return ["mummer"]


    def skipFirstLines(self):
        """
        Nothing to skip.
        """
        return None


    def parseLine(self, line):
        """
        Parse a header line, then read and parse the next line of the handle
        @param line: the header line ('> name [Reverse] Len = N')
        @type  line: string
        @return:     a mapping holding one sub-mapping
        Exits the program (sys.exit) when either line is not recognised.
        """
        mapping    = Mapping()
        subMapping = SubMapping()

        # header: the "Reverse" form is tried first, then the plain form
        headerForms = (
            (r"^>\s+(\S+)\s+Reverse\s+Len\s+=\s+(\d+)$", -1),
            (r"^>\s+(\S+)\s+Len\s+=\s+(\d+)$", 1),
        )
        for pattern, direction in headerForms:
            header = re.search(pattern, line)
            if header is not None:
                break
        else:
            sys.exit("Header line %d '%s' is strange in Mummer file" % (self.currentLineNb, line))
        subMapping.queryInterval.setName(header.group(1))
        subMapping.queryInterval.setSize(int(header.group(2)))
        subMapping.queryInterval.setDirection(direction)

        # fetch the following line; when the handle is exhausted, the header
        # line itself is kept and the line counter is left untouched
        nothingRead = object()
        following   = next(iter(self.handle), nothingRead)
        if following is not nothingRead:
            line = following
            self.currentLineNb += 1
        line = line.strip()

        # match line: target name, target start, query start, size
        hit = re.search(r"^(\w+)\s+(\d+)\s+(\d+)\s+(\d+)$", line)
        if hit is None:
            sys.exit("Line %d '%s' is strange in Mummer file" % (self.currentLineNb, line))
        subMapping.targetInterval.setName(hit.group(1))
        subMapping.targetInterval.setStart(int(hit.group(2)))
        subMapping.queryInterval.setStart(int(hit.group(3)))
        subMapping.targetInterval.setSize(int(hit.group(4)))

        mapping.addSubMapping(subMapping)

        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/NCListParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/NCListParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,125 @@ +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from commons.core.parsing.TranscriptListParser import TranscriptListParser
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
try:
    import cPickle as pickle
except ImportError:
    import pickle


class NCListParser(TranscriptListParser):
    """
    Parser for a pickled 'nclist' index file.
    The index is loaded at construction time; transcripts are then read,
    chromosome after chromosome (in sorted order), from the transcript files
    referenced by the loaded NC-lists.
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor: loads the index immediately
        @param fileName:  name of the nclist file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.title = None
        TranscriptListParser.__init__(self, fileName, verbosity)
        self.parse()

    def getFileFormats():
        """
        @return: the list of format names handled by this parser
        """
        return ["nclist"]
    getFileFormats = staticmethod(getFileFormats)

    def skipFirstLines(self):
        """
        Nothing to skip.
        """
        return

    def parse(self):
        """
        Load the four pickled objects of the index file, in the order they are
        stored, and prepare the per-chromosome reading state.
        """
        # NOTE(review): pickle.load executes arbitrary code found in the file;
        # only open nclist files coming from a trusted source.
        # Pickles are binary data, hence "rb".
        handle = open(self.fileName, "rb")
        try:
            self.sortedFileNames         = pickle.load(handle)
            self.nbElements              = pickle.load(handle)
            self.nbElementsPerChromosome = pickle.load(handle)
            self.ncLists                 = pickle.load(handle)
            for ncList in self.ncLists.values():
                ncList._reopenFiles()
        finally:
            handle.close()
        self.chromosomes     = sorted(self.nbElementsPerChromosome.keys())
        self.fileNames       = dict([chromosome, self.ncLists[chromosome]._transcriptFileName] for chromosome in self.chromosomes)
        self.currentReader   = None
        self.currentChrIndex = 0

    # The getters return the attributes set by parse(); the underscore-prefixed
    # names previously used here were never assigned.

    def getSortedFileNames(self):
        """
        @return: the first object stored in the index file
        """
        return self.sortedFileNames

    def getNbElements(self):
        """
        @return: the second object stored in the index file
        """
        return self.nbElements

    def getNbElementsPerChromosome(self):
        """
        @return: the mapping whose keys are the chromosome names
        """
        return self.nbElementsPerChromosome

    def getNCLists(self):
        """
        @return: the mapping chromosome -> NC-list
        """
        return self.ncLists

    def reset(self):
        """
        Restart the reading from the first chromosome.
        """
        self.currentChrIndex = 0
        self.currentReader   = None

    def gotoAddress(self, address):
        """
        Move the current chromosome reader to the given address
        @param address: an address previously given by the current reader
        """
        self.currentReader.gotoAddress(address)

    def getCurrentAddress(self):
        """
        @return: same as getCurrentTranscriptAddress
        """
        return self.getCurrentTranscriptAddress()

    def getCurrentTranscriptAddress(self):
        """
        @return: the address given by the current reader, 0 if no reader is open
        """
        if self.currentReader == None:
            return 0
        return self.currentReader.getCurrentTranscriptAddress()

    def getNextTranscript(self):
        """
        Read the next transcript, moving on to the next chromosome file
        whenever the current one is exhausted
        @return: the next transcript, or None when every file has been read
        """
        if self.currentReader == None:
            if not self.chromosomes:
                return None
            self.currentReader = NCListFileUnpickle(self.fileNames[self.chromosomes[self.currentChrIndex]])
        transcript = self.currentReader.getNextTranscript()
        # a loop (not a single test) so that a chromosome file that yields
        # nothing does not leak the reader's False end marker to the caller
        while transcript == False:
            self.currentChrIndex += 1
            if self.currentChrIndex >= len(self.chromosomes):
                return None
            self.currentReader = NCListFileUnpickle(self.fileNames[self.chromosomes[self.currentChrIndex]])
            transcript = self.currentReader.getNextTranscript()
        return transcript

    def getInfos(self):
        """
        Compute self.size as the sum of the sizes of all the transcripts.
        """
        self.size = 0
        self.reset()
        progress = UnlimitedProgress(100000, "Getting information on %s." % (self.fileName), self.verbosity-9)
        # NOTE(review): getIterator is inherited and not visible here; no
        # transcript is read before iterating so that none can be skipped.
        for transcript in self.getIterator():
            self.size += transcript.getSize()
            progress.inc()
        progress.done()
        self.reset()

    def getNbTranscripts(self):
        """
        @return: the number of elements recorded in the index file
        """
        return self.nbElements
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/NucmerParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/NucmerParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,88 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.SubMapping import SubMapping
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.Interval import Interval
from commons.core.parsing.MapperParser import MapperParser


class NucmerParser(MapperParser):
    """A class that parses the output of Nucmer"""

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName:  name of the Nucmer file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        # name read from the last '>' line; set before the parent constructor
        # runs so that parseLine can always test it
        self.currentChromosome = None
        super(NucmerParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(NucmerParser, self).__del__()


    def getFileFormats():
        """
        @return: the list of format names handled by this parser
        """
        return ["nucmer"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        """
        Nothing to skip.
        """
        pass


    def parseLine(self, line):
        """
        Parse one line
        @param line: a '>' line (which only records the current target name)
                     or an 8-column data line
        @type  line: string
        @return:     a mapping for a data line, None for an empty or '>' line
        @raise Exception: if the line does not have the expected shape
        """
        if not line:
            return None
        if line[0] == ">":
            names = line[1:].split()
            if not names:
                raise Exception("Line %d '%s' does not have a NucMer format" % (self.currentLineNb, line))
            self.currentChromosome = names[0]
            return None
        splittedLine = line.strip().split()
        if len(splittedLine) != 8:
            raise Exception("Line %d '%s' does not have a NucMer format" % (self.currentLineNb, line))
        if self.currentChromosome is None:
            # data line met before any '>' line: there is no target name to use
            raise Exception("Line %d '%s' does not have a NucMer format" % (self.currentLineNb, line))

        # columns used: 0-1 target coordinates, 3 size, 5 identity,
        # 6 direction, 7 query name
        firstCoordinate  = int(splittedLine[0])
        secondCoordinate = int(splittedLine[1])
        size             = int(splittedLine[3])
        direction        = splittedLine[6]
        queryName        = splittedLine[7]

        subMapping = SubMapping()

        subMapping.targetInterval.setChromosome(self.currentChromosome)
        subMapping.targetInterval.setName(self.currentChromosome)
        subMapping.targetInterval.setStart(min(firstCoordinate, secondCoordinate))
        subMapping.targetInterval.setEnd(max(firstCoordinate, secondCoordinate))
        subMapping.targetInterval.setDirection(direction)

        subMapping.queryInterval.setChromosome(queryName)
        subMapping.queryInterval.setName(queryName)
        subMapping.queryInterval.setStart(1)
        subMapping.queryInterval.setEnd(size)
        subMapping.queryInterval.setDirection("+")

        mapping = Mapping()
        mapping.addSubMapping(subMapping)
        mapping.setDirection(direction)
        mapping.setIdentity(float(splittedLine[5]))
        mapping.setSize(size)

        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/PalsToAlign.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/PalsToAlign.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
import time
import os
import subprocess

class PalsToAlign(object):
    """
    Convert the output from PALS (GFF2 format) into the 'align' format.
    """
    def __init__(self,inputPalsFileName="" , outputAlignFileName="", removeSameSequences=False):
        """
        @param inputPalsFileName:   name of the PALS (GFF2) input file
        @type  inputPalsFileName:   string
        @param outputAlignFileName: name of the sorted 'align' output file
        @type  outputAlignFileName: string
        @param removeSameSequences: drop the 100% identity hits between two
                                    'chunk' sequences that both start at 1
        @type  removeSameSequences: boolean
        """
        self._removeSameSequences = removeSameSequences
        self._inputPalsFileName = inputPalsFileName
        self._outputAlignFileName = outputAlignFileName

    def _convertLine(self, line):
        """
        Convert one PALS line into one 'align' line
        @return: the 'align' line, or None if the line is blank or filtered out
        """
        if line.strip() == "":
            return None

        # only the end-of-line is removed, so that a last line without a
        # newline does not lose a data character
        data = line.rstrip("\r\n").split("\t")

        qryName    = data[0]
        qryStart   = data[3]
        qryEnd     = data[4]
        score      = data[5]
        strand     = data[6]
        attributes = data[8].split()

        sbjName  = attributes[1]
        sbjStart = attributes[2]
        sbjEnd   = attributes[3].rstrip(";")
        percId   = (1 - float(attributes[-1])) * 100.0

        if strand != "+":
            sbjStart, sbjEnd = sbjEnd, sbjStart

        if self._removeSameSequences \
        and "chunk" in qryName and "chunk" in sbjName \
        and min(int(qryStart), int(qryEnd)) == 1 \
        and min(int(sbjStart), int(sbjEnd)) == 1 \
        and percId == 100.0:
            return None

        # coordinates are compared as numbers: as strings, "99" sorts after "100"
        if int(qryStart) < int(qryEnd):
            fields = (qryName, qryStart, qryEnd, sbjName, sbjStart, sbjEnd, "0.0", score, percId)
        else:
            fields = (qryName, qryEnd, qryStart, sbjName, sbjEnd, sbjStart, "0.0", score, percId)
        return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % fields

    def run (self):
        """
        Convert the input file, then sort the result into the output file
        with the external 'sort' program.
        @raise subprocess.CalledProcessError: if 'sort' fails
        """
        tmpFileName = "PalsToAlign%s"%str(os.getpid() )
        try:
            with open(self._inputPalsFileName, "r") as palsFile:
                with open(tmpFileName, "w") as tmpFile:
                    for line in palsFile:
                        alignLine = self._convertLine(line)
                        if alignLine is not None:
                            tmpFile.write(alignLine)

            # argument list and explicit redirection instead of a shell
            # command line: file names are never interpreted by a shell
            sortCommand = ["sort", "-k", "1,1", "-k", "4,4", "-k", "2,2n", "-k", "3,3n", "-k", "5,5n", "-k", "6,6n", "-k", "8,8n", tmpFileName]
            with open(self._outputAlignFileName, "w") as outputFile:
                subprocess.check_call(sortCommand, stdout=outputFile)
        finally:
            if os.path.exists(tmpFileName):
                os.remove(tmpFileName)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/ParserChooser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/ParserChooser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,129 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+# +import sys +from commons.core.parsing.TranscriptListParser import TranscriptListParser +from commons.core.parsing.MapperParser import MapperParser +from commons.core.parsing.SequenceListParser import SequenceListParser +from commons.core.parsing.BedParser import BedParser +from commons.core.parsing.GffParser import GffParser +from commons.core.parsing.MapperParser import MapperParser +from commons.core.parsing.CoordsParser import CoordsParser +from commons.core.parsing.SeqmapParser import SeqmapParser +from commons.core.parsing.SoapParser import SoapParser +from commons.core.parsing.Soap2Parser import Soap2Parser +from commons.core.parsing.BlastParser import BlastParser +from commons.core.parsing.PslParser import PslParser +from commons.core.parsing.RmapParser import RmapParser +from commons.core.parsing.ShrimpParser import ShrimpParser +from commons.core.parsing.AxtParser import AxtParser +from commons.core.parsing.ExoParser import ExoParser +from commons.core.parsing.MaqParser import MaqParser +from commons.core.parsing.SamParser import SamParser +from commons.core.parsing.BamParser import BamParser +from commons.core.parsing.BowtieParser import BowtieParser +from commons.core.parsing.ElandParser import ElandParser +from commons.core.parsing.GtfParser import GtfParser +from commons.core.parsing.FastaParser import FastaParser +from commons.core.parsing.FastqParser import FastqParser +from commons.core.parsing.MapParser import MapParser +from commons.core.parsing.WigParser import WigParser +from commons.core.parsing.NCListParser import NCListParser +from commons.core.parsing.PklParser import PklParser + +#Attention!! Do not delete the imports!! They are used to know the type of file format!!! 
class ParserChooser(object):
    """
    Picks the parser class that handles a given file format
    @ivar type:        kind of parser found (transcript / mapping / sequence)
    @type type:        string
    @ivar parserClass: the parser class found
    @type parserClass: class
    @ivar verbosity:   verbosity
    @type verbosity:   int
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.verbosity   = verbosity
        self.parserClass = None
        self.type        = None


    def findFormat(self, format, type = None):
        """
        Look for a parser handling the format among the direct subclasses of
        the base parser classes, and remember it
        @param format: the format
        @type  format: string
        @param type:   transcript / mapping / sequence parser (None is all)
        @type  type:   string
        @raise Exception: if the type is unknown or no parser handles the format
        """
        if type == None:
            families = {TranscriptListParser: "transcript", MapperParser: "mapping", SequenceListParser: "sequence"}
        elif type == "transcript":
            families = {TranscriptListParser: "transcript"}
        elif type == "mapping":
            families = {MapperParser: "mapping"}
        elif type == "sequence":
            families = {SequenceListParser: "sequence"}
        else:
            raise Exception("Do not understand format type '%s'" % (type))

        for family, familyName in families.items():
            for candidate in family.__subclasses__():
                if format not in candidate.getFileFormats():
                    continue
                self.parserClass = candidate
                self.type        = familyName
                return
        raise Exception("Cannot get parser for format '%s'" % (format))


    def getParser(self, fileName):
        """
        Instantiate the parser class previously found
        @param fileName: name of the file to parse
        @type  fileName: string
        @return:         the parser
        """
        parserClass = self.parserClass
        return parserClass(fileName, self.verbosity)


    def getType(self):
        """
        Get the type of parser previously found
        @return: the type of parser
        """
        return self.type
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/PathNum2Id.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/PathNum2Id.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
class PathNum2Id( object ):
    """
    Renumber the path identifiers of a 'path' file.
    """

    def __init__(self):
        self._inFileName = None
        self._outFileName = None

    def setInFileName(self, fileName):
        """
        @param fileName: name of the input 'path' file
        @type  fileName: string
        """
        self._inFileName = fileName

    def setOutFileName(self, fileName):
        """
        @param fileName: name of the output 'path' file
        @type  fileName: string
        """
        self._outFileName = fileName

    def run( self ):
        """
        Adapt the path IDs as the input file is the concatenation of several 'path' files.
        Each distinct (path, query name, subject name) triple receives a new
        identifier, numbered from 1 in order of first appearance. Blank lines
        are skipped.
        """
        # keyed by the triple itself: a "-"-joined string would give the same
        # key to different triples whenever a name contains "-"
        dID2count = {}
        with open( self._inFileName, "r" ) as inFile:
            with open( self._outFileName, "w" ) as outFile:
                for line in inFile:
                    strippedLine = line.strip('\n')
                    if strippedLine.strip() == "":
                        continue
                    data = strippedLine.split("\t")
                    path = data[0]
                    qryName = data[1]
                    qryStart = int(data[2])
                    qryEnd = int(data[3])
                    sbjName = data[4]
                    sbjStart = int(data[5])
                    sbjEnd = int(data[6])
                    BLAST_Eval = data[7]
                    BLAST_score = data[8]
                    percId = data[9]
                    # an unseen triple gets the next free number
                    newPath = dID2count.setdefault( (path, qryName, sbjName), len(dID2count) + 1 )
                    cmd = "%i\t%s\t%i\t%i\t%s\t%i\t%i\t%s\t%s\t%s\n" % ( newPath, qryName, qryStart, qryEnd, sbjName, sbjStart, sbjEnd, BLAST_Eval, BLAST_score, percId )
                    outFile.write( cmd )
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/PilerTAToGrouperMap.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/PilerTAToGrouperMap.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
import time
import os

class PilerTAToGrouperMap(object):
    """
    Convert the output file from Piler into grouper format.
    """
    def __init__(self, inputGffFileName, inputPYRFileName, inputMOTIFFileName, outputFileName):
        """
        @param inputGffFileName:   name of the Piler GFF file (holds the pile names)
        @type  inputGffFileName:   string
        @param inputPYRFileName:   name of the Piler pyramid GFF file
        @type  inputPYRFileName:   string
        @param inputMOTIFFileName: name of the Piler motif file
        @type  inputMOTIFFileName: string
        @param outputFileName:     name of the pile / pyramid info file to write
        @type  outputFileName:     string
        """
        self._inputGffFileName = inputGffFileName
        self._inputPYRFileName = inputPYRFileName
        self._inputMOTIFFileName = inputMOTIFFileName
        self._outFileName = outputFileName

    def run (self):
        """
        Write the info file, then '<motif file>.grp' and '<motif file>.grp.map'.
        """
        self._writePileInfo()
        self._writeGrouperFiles()

    def _writePileInfo(self):
        """
        Step 0: write one 'pile<TAB>pyramid' line per line of the pyramid file.
        """
        # the GFF lines are loaded once: they are searched again for every
        # pyramid, which a file handle read to its end cannot offer
        with open( self._inputGffFileName, "r" ) as inFileGff:
            gffLines = inFileGff.readlines()

        with open( self._inputPYRFileName, "r" ) as inFilePyr:
            with open(self._outFileName,"w") as outFile:
                for pyrLine in inFilePyr:
                    if pyrLine.strip() == "":
                        continue
                    pileIndex = ""
                    # 9th column; it is written back as it is, end of line included
                    pyrIndex = pyrLine.split('\t')[8].replace ('PyramidIndex', 'Pyramid')
                    for gffLine in gffLines:
                        if pyrIndex in gffLine:
                            pileIndex = gffLine.split(';')[1].strip()
                            break
                    outFile.write("%s\t%s" % (pileIndex, pyrIndex))

    def _writeGrouperFiles(self):
        """
        Step 1: add the pile info to each motif and write two files, one with
        the grouper ID and one in map format.
        @raise IndexError: if no pile name is known for the pyramid of a motif
        """
        outFileMotifGrpFileName = self._inputMOTIFFileName + ".grp"
        outFileMotifGrpMapFileName = self._inputMOTIFFileName + ".grp.map"

        with open(self._outFileName,"r") as inFileInfo:
            inFileInfos = inFileInfo.readlines()
        # never reset: the info lines are scanned forward only, so motifs are
        # expected in the same order as the info lines
        lineInfoIndex = 0

        with open(self._inputMOTIFFileName, "r" ) as inFileMotif:
            with open(outFileMotifGrpFileName, "w" ) as outFileMotifGrp:
                with open(outFileMotifGrpMapFileName, "w" ) as outFileMotifGrpMap:
                    # numbered from 1 on the lines of the motif file
                    for countMotif, lineMotif in enumerate(inFileMotif, 1):
                        if lineMotif.strip() == "":
                            continue
                        motif, pyrNameMotif = lineMotif.split(';')[:2]
                        pyrNameMotif = pyrNameMotif.strip()
                        pileNameMotif = ""

                        while lineInfoIndex < len(inFileInfos):
                            lineInfo = inFileInfos[lineInfoIndex]
                            if pyrNameMotif in lineInfo:
                                pileNameMotif = lineInfo.split('\t')[0]
                                break
                            lineInfoIndex +=1

                        #translate to Grouper IdFormat
                        pyrID = pyrNameMotif.split(' ')[1]
                        pileFields = pileNameMotif.split(' ')
                        if len(pileFields) < 2:
                            raise IndexError("No pile found for '%s' (motif line %d of '%s')" % (pyrNameMotif, countMotif, self._inputMOTIFFileName))
                        pileID = pileFields[1]
                        dataMotif = motif.split ('\t')
                        chrm = dataMotif [0]
                        start,end = dataMotif [3:5]
                        memberID = "MbS%sGr" % (countMotif) + pyrID + "Cl" + pileID

                        stringMotif = "%s\t%s\t%s\t%s\n" % ( memberID, motif, pileNameMotif, pyrNameMotif)
                        outFileMotifGrp.write( stringMotif)

                        stringGrpMap = "%s\t%s\t%s\t%s\n" % ( memberID, chrm, start, end )
                        outFileMotifGrpMap.write( stringGrpMap )
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/PklParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/PklParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,112 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
try:
    import cPickle as pickle
except ImportError:
    import pickle
from SMART.Java.Python.structure.Interval import Interval
from commons.core.parsing.TranscriptListParser import TranscriptListParser
from SMART.Java.Python.structure.Transcript import Transcript


class PklParser(TranscriptListParser):
    """A class that parses the intern PKL file and create a transcript list"""

    def __init__(self, fileName, verbosity = 1):
        """
        Constructor
        @param fileName:  name of the PKL file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.title = None
        super(PklParser, self).__init__(fileName, verbosity)
        # NOTE(review): the parent constructor is not visible here; if it also
        # opens a handle, that one is replaced by this binary handle -- confirm.
        self.handle = open(fileName, "rb")
        self.verbosity = verbosity
        self.initAddress = 0
        self.address = self.initAddress
        self.over = False
        self.chromosome = None

    def __del__(self):
        super(PklParser, self).__del__()

    def getFileFormats():
        """
        @return: the list of format names handled by this parser
        """
        return ["pkl"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        """
        Nothing to skip.
        """
        return


    def reset(self):
        """
        Go back to the beginning of the file and forget the reading state.
        """
        self.handle.seek(0)
        self.initAddress = 0
        # the position and the end flag belong to the previous pass
        self.address = 0
        self.over = False


    def setChromosome(self, chromosome):
        """
        Restrict the reading to one chromosome: reading stops at the first
        transcript found on another one
        @param chromosome: name of the chromosome (None is no restriction)
        @type  chromosome: string
        """
        self.chromosome = chromosome


    def gotoAddress(self, address):
        """
        Move to a position in the file
        @param address: offset in the file
        @type  address: int
        """
        self.handle.seek(address)
        self.address = address


    def getNextTranscript(self):
        """
        Read the next pickled transcript
        @return: the transcript, or False at the end of the file or when the
                 transcript is not on the selected chromosome
        """
        self.address = self.handle.tell()
        try:
            # NOTE(review): pickle.load executes arbitrary code found in the
            # file; only read PKL files coming from a trusted source.
            transcript = pickle.load(self.handle)
            if self.chromosome != None and transcript.getChromosome() != self.chromosome:
                self.over = True
                return False
            return transcript
        except EOFError:
            self.over = True
            return False


    def getIterator(self):
        """
        Iterate over the transcripts, starting from the initial address
        @return: a generator of transcripts
        """
        self.gotoAddress(self.initAddress)
        # a new pass starts: the end flag of a previous pass no longer holds
        self.over = False
        while True:
            transcript = self.getNextTranscript()
            if not transcript:
                self.over = True
                return
            yield transcript


    def setInitAddress(self, address):
        """
        Set the address the iterator starts from
        @param address: offset in the file
        @type  address: int
        """
        self.initAddress = address


    def getCurrentTranscriptAddress(self):
        """
        @return: the address of the last transcript read (or the last address
                 given to gotoAddress)
        """
        return self.address


    def isOver(self):
        """
        @return: whether the end of the reading has been reached
        """
        return self.over
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/PslParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/PslParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,155 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.SubMapping import SubMapping
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

class PslParser(MapperParser):
    """A class that parses the output of PSL format (of SSAHA and BLAT)"""

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName:  name of the PSL file to parse
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(PslParser, self).__init__(fileName, verbosity)


    def __del__(self):
        """
        Destructor (delegates to the parent class)
        """
        super(PslParser, self).__del__()


    def getFileFormats():
        """
        @return: the list of file formats handled by this parser
        """
        return ["psl"]
    getFileFormats = staticmethod(getFileFormats)


    def getInfos(self):
        """
        Read the whole file once to collect the chromosome names (14th
        tab-separated field), the number of mappings and a cumulated size
        """
        self.chromosomes = set()
        self.nbMappings  = 0
        self.size        = 0
        self.reset()
        progress = UnlimitedProgress(100000, "Getting info on PSL file, # mappings read:", self.verbosity)
        for line in self.handle:
            progress.inc()
            line = line.strip()
            if line == "":
                continue
            parts      = line.split("\t")
            chromosome = parts[13]
            self.chromosomes.add(chromosome)
            self.nbMappings += 1
            # NOTE(review): this adds the number of characters of the first
            # field, not its numeric value -- confirm this is intended.
            self.size += len(parts[0])
        self.reset()
        progress.done()


    def skipFirstLines(self):
        """
        Skip the header, i.e. every line up to (and including) the first line
        containing "------".  If the end of the file is reached without
        finding such a line, the file is considered header-less and the handle
        is put back where it was, so that every line is parsed.
        """
        try:
            startPosition = self.handle.tell()
        except (IOError, OSError):
            # non-seekable input: cannot rewind
            startPosition = None
        nbSkippedLines = 0
        while True:
            line = self.handle.readline()
            if not line:
                # readline() keeps returning an empty string at EOF: without
                # this test, a file with no separator line would loop forever
                if startPosition is not None:
                    self.handle.seek(startPosition)
                return
            if "------" in line:
                break
            nbSkippedLines += 1
        # as before, the separator line itself is not counted
        if nbSkippedLines:
            self.currentLineNb += nbSkippedLines

    def _computeStarts(self,seqSize,blockSize,start,targetStrand):
        """
        Convert the start of a block given on the reverse strand to a start on
        the forward strand
        @param seqSize:      size of the whole sequence
        @param blockSize:    size of the block
        @param start:        start of the block, as read in the file
        @param targetStrand: strand ("+" leaves the start unchanged)
        @return:             the (possibly converted) start
        """
        if targetStrand == "+":
            return start
        return seqSize - blockSize - start



    def parseLine(self, line):
        """
        Parse one PSL line
        @param line: a line of the file
        @type  line: string
        @return:     a mapping, with one sub-mapping per alignment block
        @raise Exception: if the line does not match the expected layout
        """
        m = re.search(r"^\s*(psl:\s+)?(\d+)\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+(\d+)\s+([+-]{1,2})\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s*$", line)
        if m is None:
            raise Exception("\nLine %d '%s' does not have a PSL format" % (self.currentLineNb, line))

        mapping = Mapping()

        # the strand field holds one character (query strand only, target is
        # then "+") or two characters (query strand, then target strand)
        strands     = m.group(7)
        queryStrand = strands[0]
        if len(strands) == 1:
            targetStrand = "+"
        else:
            targetStrand = strands[1]

        # values which do not depend on the block are read once
        queryName    = m.group(8)
        querySize    = int(m.group(9))
        chromosome   = m.group(12)
        targetSize   = int(m.group(13))
        blockSizes   = m.group(17).split(",")
        queryStarts  = m.group(18).split(",")
        targetStarts = m.group(19).split(",")

        for i in range(0, int(m.group(16))):
            size        = int(blockSizes[i])
            queryStart  = int(queryStarts[i])
            targetStart = int(targetStarts[i])

            subMapping = SubMapping()
            subMapping.setSize(size)
            subMapping.setDirection(queryStrand)

            queryInterval  = Interval()
            targetInterval = Interval()

            queryInterval.setName(queryName)
            # NOTE(review): the query start is converted according to the
            # *target* strand; kept as is -- confirm this is intended.
            queryStart = self._computeStarts(querySize,size,queryStart,targetStrand)
            # starts read in the file are shifted by one, ends are start + size
            queryInterval.setStart(queryStart + 1)
            queryInterval.setEnd(queryStart + size)
            queryInterval.setDirection(queryStrand)

            targetInterval.setChromosome(chromosome)
            targetStart = self._computeStarts(targetSize,size,targetStart,targetStrand)
            targetInterval.setStart(targetStart + 1)
            targetInterval.setEnd(targetStart + size)
            targetInterval.setDirection(targetStrand)

            subMapping.setQueryInterval(queryInterval)
            subMapping.setTargetInterval(targetInterval)
            mapping.addSubMapping(subMapping)

        mapping.setSize(int(m.group(2)) + int(m.group(3)) + int(m.group(4)))
        mapping.setNbMismatches(int(m.group(3)) + int(m.group(4)))
        mapping.setNbGaps(int(m.group(5)))
        mapping.setDirection(queryStrand)

        queryInterval  = Interval()
        targetInterval = Interval()

        queryInterval.setName(queryName)
        queryInterval.setStart(min(int(m.group(10)), int(m.group(11))))
        queryInterval.setEnd(  max(int(m.group(10)), int(m.group(11))))
        queryInterval.setDirection(queryStrand)

        targetInterval.setChromosome(chromosome)
        targetInterval.setStart(min(int(m.group(14))+1, int(m.group(15))))
        targetInterval.setEnd(  max(int(m.group(14))+1, int(m.group(15))))
        targetInterval.setDirection(targetStrand)

        mapping.setQueryInterval(queryInterval)
        mapping.setTargetInterval(targetInterval)

        return mapping

b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/README_MultiFasta2SNPFile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/README_MultiFasta2SNPFile Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,66 @@ +*** DESCRIPTION: *** +This program takes as input a multifasta file (with sequences already aligned together and formatted in fasta in the same file), considers the first sequence as the reference sequence, infers polymorphisms and generates output files in GnpSNP exchange format. + + +*** INSTALLATION: *** +Dependencies: +- First you need Python installed on your system. +- Repet libraries are also required. + +*** OPTIONS OF THE LAUNCHER: *** + + -h: this help + +Mandatory options: + -b: Name of the batch of submitted sequences + -g: Name of the gene + -t: Scientific name of the taxon concerned + +Exclusive options (use either the first or the second) + -f: Name of the multifasta input file (for one input file) + -d: Name of the directory containing multifasta input file(s) (for several input files) + + + +*** COMMAND LINE EXAMPLE (for package use): *** +- First, you need to set up the environment variable PYTHONPATH (to link with the dependencies). + +- Then for one input file (here our example), run: + +python multifastaParserLauncher.py -b Batch_test -g GeneX -t "Arabidopsis thaliana" -f Exemple_multifasta_input.fasta + + +- For several input files, create a directory in the root of the uncompressed package and put your input files in it. Then use this type of command line: + +python multifastaParserLauncher.py -b Batch_test -g GeneX -t "Arabidopsis thaliana" -d <Name_of_the_directory> + +Each of the input files will generate a directory with its own set of output files. + + +*** SIMPLE USE (for package use): *** +Two executables (one for Windows, the other for Linux/Unix) are in the package. +They show the command lines to use in order to set up the environment variables and then to run the parser on our sample input file (Example_multifasta_input.fasta). +You can edit the executable and customize the command line to use it with your own input file. 
+ + +*** BACKLOG (next version) *** +When the launcher is called for several input files (with the -d option), the parser should be able to generate only one set of files describing all the batches (one batch per input file). +The tasks of the backlog dedicated to this feature are listed below: + +- in Multifasta2SNPFile class: + # CONSTRUCTOR: Modify the constructor to add a "several batches" mode called without BatchName and GeneName + # RUNNING METHOD: Add the run_several_batches(directory) method that will browse the input files and iterate over them to run each of them successively (see runSeveralInputFile() method of the launcher) + => 2 days + + # BATCH MANAGEMENT: Modify createBatchDict() to create one batch per file in the dictionary and add a class variable to point toward the current batch (ex: self._iCurrentLineNumber) + # BATCH-LINE MANAGEMENT: Modify the _completeBatchLineListWithCurrentIndividual method to allow several batches and link lines to batches (for the moment hard coded batch no1) + # SUBSNP MANAGEMENT: check that all elements (dSUbSNP) added in the SubSNP list (lSubSNPFileResults) are linked to the current batch (for the moment hard coded batch no1) + Impacted methods: manageSNPs(), createSubSNPFromAMissingPolym(), addMissingAllelesAndSubSNPsForOnePolym(), mergeAllelesAndSubSNPsFromOverlappingIndels() + => + 2 days + +- in Multifasta2SNPFileWriter class: + # Modify all the _write<X>File methods (ex: _writeSubSNPFile) to write in append mode and externalize the opening and closing of all files + # Create one method to open all the output files and call it in the Multifasta2SNPFile run_several_batches method + # Create one method to close all the output files and call it in the Multifasta2SNPFile run_several_batches method + + => + 2 days |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/RmapParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/RmapParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,76 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Mapping import Mapping

class RmapParser(MapperParser):
    """Parser for the output of Rmap: builds one mapping per line"""

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName:  name of the Rmap file to parse
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(RmapParser, self).__init__(fileName, verbosity)


    def __del__(self):
        """
        Destructor (delegates to the parent class)
        """
        super(RmapParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """
        @return: the list of file formats handled by this parser
        """
        return ["rmap"]


    def skipFirstLines(self):
        """
        No line is skipped for this format
        """
        pass


    def parseLine(self, line):
        """
        Parse one line: chromosome, two positions, read name, number of
        mismatches and strand, separated by blanks
        @param line: a line of the file
        @type  line: string
        @return:     the corresponding mapping
        """
        match = re.search(r"^\s*(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+([+-])\s*$", line)
        if match is None:
            sys.exit("\nLine %d '%s' does not have a RMAP format" % (self.currentLineNb, line))

        chromosome, firstPos, secondPos, readName, nbMismatches, strand = match.groups()
        firstPos  = int(firstPos)
        secondPos = int(secondPos)

        mapping = Mapping()

        # the two positions may come in any order: keep the smallest as start
        target = mapping.targetInterval
        target.setChromosome(chromosome)
        target.setStart(min(firstPos, secondPos))
        target.setEnd(max(firstPos, secondPos))

        query = mapping.queryInterval
        query.setName(readName)
        query.setStart(1)
        query.setSize(target.getEnd() - target.getStart())

        mapping.setSize(target.getEnd() - target.getStart())
        mapping.setNbMismatches(int(nbMismatches))
        mapping.setDirection(strand)

        return mapping


b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/SamParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/SamParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,234 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import re\n+import sys\n+from commons.core.parsing.MapperParser import MapperParser\n+from SMART.Java.Python.structure.Mapping import Mapping\n+from SMART.Java.Python.structure.SubMapping import SubMapping\n+from SMART.Java.Python.structure.Interval import Interval\n+\n+class SamParser(MapperParser):\n+ """A class that parses SAM format (as given by BWA)"""\n+\n+ def __init__(self, fileName, verbosity = 0):\n+ super(SamParser, self).__init__(fileName, verbosity)\n+\n+\n+ def __del__(self):\n+ super(SamParser, self).__del__()\n+\n+\n+ def getFileFormats():\n+ return ["sam"]\n+ getFileFormats = staticmethod(getFileFormats)\n+\n+\n+ def skipFirstLines(self):\n+ pass\n+\n+\n+ def getInfos(self):\n+ self.chromosomes = set()\n+ self.nbMappings = 0\n+ self.size = 0\n+ self.reset()\n+ if self.verbosity >= 10:\n+ print "Getting information on SAM file"\n+ self.reset()\n+ for line in self.handle:\n+ line = line.strip()\n+ if line == "" or line[0] == "@":\n+ continue\n+ parts = line.split("\\t")\n+ chromosome = parts[2]\n+ if chromosome != "*":\n+ self.chromosomes.add(chromosome)\n+ self.nbMappings += 1\n+ self.size += len(parts[8])\n+ if self.verbosity >= 10 and self.nbMappings % 100000 == 0:\n+ sys.stdout.write(" %d mappings read\\r" % (self.nbMappings))\n+ sys.stdout.flush()\n+ self.reset()\n+ if self.verbosity >= 10:\n+ print " %d mappings read" % (self.nbMappings)\n+ print "Done."\n+\n+\n+ def parseLine(self, line):\n+\n+ line = line.strip()\n+ if line[0] == "@":\n+ return\n+\n+ fields = line.split("\\t")\n+ if len(fields) < 11:\n+ raise 
Exception("Line %d \'%s\' does not look like a SAM line (number of fields is %d instead of 11)" % (self.currentLineNb, line, len(fields)))\n+\n+ name = fields[0]\n+ flag = int(fields[1])\n+\n+ if (flag & 0x4) == 0x4:\n+ return None\n+\n+ direction = 1 if (flag & 0x10) == 0x0 else -1\n+ chromosome = fields[2]\n+ genomeStart = int(fields[3])\n+ quality = fields[4]\n+ cigar = fields[5]\n+ mate = fields[6]\n+ mateGenomeStart = fields[7]\n+ gapSize = fields[8]\n+ sequence = fields[9]'..b'e:\n+ currentNumber = currentNumber * 10 + (ord(char) - ord("0"))\n+ continue\n+ # match\n+ m = re.match(r"[M]", char)\n+ if m != None:\n+ if readStart == None:\n+ readStart = queryOffset\n+ if subMapping == None:\n+ subMapping = SubMapping()\n+ subMapping.setSize(currentNumber)\n+ subMapping.setDirection(direction)\n+ subMapping.queryInterval.setName(name)\n+ subMapping.queryInterval.setStart(queryOffset)\n+ subMapping.queryInterval.setDirection(direction)\n+ subMapping.targetInterval.setChromosome(chromosome)\n+ subMapping.targetInterval.setStart(genomeStart + targetOffset)\n+ subMapping.targetInterval.setDirection(1)\n+ nbMatches += currentNumber\n+ targetOffset += currentNumber\n+ queryOffset += currentNumber\n+ currentNumber = 0\n+ continue\n+ # insertion on the read\n+ m = re.match(r"[I]", char)\n+ if m != None:\n+ nbGaps += 1\n+ queryOffset += currentNumber\n+ currentNumber = 0\n+ continue\n+ # insertion on the genome\n+ m = re.match(r"[D]", char)\n+ if m != None:\n+ if subMapping != None:\n+ subMapping.queryInterval.setEnd(queryOffset - 1)\n+ subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)\n+ mapping.addSubMapping(subMapping)\n+ subMapping = None\n+ nbGaps += 1\n+ targetOffset += currentNumber\n+ currentNumber = 0\n+ continue\n+ # intron\n+ m = re.match(r"[N]", char)\n+ if m != None:\n+ if subMapping != None:\n+ subMapping.queryInterval.setEnd(queryOffset - 1)\n+ subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)\n+ 
mapping.addSubMapping(subMapping)\n+ subMapping = None\n+ targetOffset += currentNumber\n+ currentNumber = 0\n+ continue\n+ # soft clipping (substitution)\n+ m = re.match(r"[S]", char)\n+ if m != None:\n+ nbMismatches += currentNumber\n+ targetOffset += currentNumber\n+ queryOffset += currentNumber\n+ currentNumber = 0\n+ continue\n+ # hard clipping\n+ m = re.match(r"[H]", char)\n+ if m != None:\n+ targetOffset += currentNumber\n+ queryOffset += currentNumber\n+ currentNumber = 0\n+ continue\n+ # padding\n+ m = re.match(r"[P]", char)\n+ if m != None:\n+ continue\n+ raise Exception("Do not understand paramer \'%s\' in line %s" % (char, line))\n+\n+ if subMapping != None:\n+ subMapping.queryInterval.setEnd(queryOffset - 1)\n+ subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)\n+ mapping.addSubMapping(subMapping)\n+\n+ mapping.queryInterval.setStart(readStart)\n+ mapping.queryInterval.setEnd(queryOffset - 1)\n+ mapping.targetInterval.setEnd(genomeStart + targetOffset - 1)\n+ mapping.setNbMismatches(nbMismatches)\n+ mapping.setNbGaps(nbGaps)\n+\n+ mapping.queryInterval.setName(name)\n+ mapping.queryInterval.setDirection(direction)\n+ mapping.targetInterval.setChromosome(chromosome)\n+ mapping.targetInterval.setStart(genomeStart)\n+ mapping.targetInterval.setDirection(direction)\n+ mapping.setSize(len(sequence))\n+ mapping.setDirection(direction)\n+\n+ return mapping\n+\n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/SeqmapParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/SeqmapParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,81 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.structure.Mapping import Mapping


class SeqmapParser(MapperParser):
    """Parser for the output of SeqMap: builds one mapping per line"""

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName:  name of the SeqMap file to parse
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(SeqmapParser, self).__init__(fileName, verbosity)


    def __del__(self):
        """
        Destructor (delegates to the parent class)
        """
        super(SeqmapParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """
        @return: the list of file formats handled by this parser
        """
        return ["seqmap"]


    def skipFirstLines(self):
        """
        Skip the first line if it contains "trans_id"; otherwise put the
        handle (and the line counter) back where they were.  In both cases,
        the resulting position is stored in startingPoint.
        """
        position = self.handle.tell()
        self.startingPoint = position
        self.currentLineNb += 1
        firstLine = self.handle.readline()
        if "trans_id" not in firstLine:
            # not a header: undo the read
            self.currentLineNb -= 1
            self.handle.seek(position)
        self.startingPoint = self.handle.tell()


    def parseLine(self, line):
        """
        Parse one tab-separated line
        @param line: a line of the file
        @type  line: string
        @return:     the corresponding mapping
        """
        match = re.search(r"^\s*(\S+)\t+(\d+)\t+(\w+)\t+([^\t]+)\t+(\w+)\t+(\d+)\t+([+-])\s*$", line)
        if match is None:
            sys.exit("\nLine %d '%s' does not have a SeqMap format" % (self.currentLineNb, line))

        # the length of the third field is used as the size of both intervals
        # and of the mapping itself
        length = len(match.group(3))

        mapping = Mapping()

        target = mapping.targetInterval
        target.setChromosome(match.group(1))
        target.setStart(int(match.group(2)))
        target.setSize(length)

        query = mapping.queryInterval
        query.setName(match.group(4))
        query.setStart(1)
        query.setSize(length)

        mapping.setSize(length)
        mapping.setNbMismatches(int(match.group(6)))
        mapping.setDirection(match.group(7))

        return mapping

b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/SequenceListParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/SequenceListParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,228 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from SMART.Java.Python.structure.SequenceList import SequenceList
from SMART.Java.Python.misc.Progress import Progress

class SequenceListParser(object):
    """
    A virtual class that reads a list of sequences.
    Sub-classes are expected to provide parseOne(), which is called by
    getIterator() and should return the next sequence, or None at the end.
    @ivar verbosity:       verbosity
    @type verbosity:       int
    @ivar fileName:        name of the file to parse
    @type fileName:        string
    @ivar handle:          file to parse
    @type handle:          file
    @ivar nbSequences:     number of sequences in the file
    @type nbSequences:     int
    @ivar nbReadSequences: number of sequences read
    @type nbReadSequences: int
    @ivar currentLine:     line currently read
    @type currentLine:     string
    @ivar size:            total number of nucleotides in the sequences
    @type size:            int
    @ivar sizes:           number of nucleotides per sequences
    @type sizes:           dict of string to int
    """

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        @param fileName:  name of the file to parse
        @type  fileName:  string
        @raise Exception: if the file cannot be opened
        """
        self.verbosity       = verbosity
        self.fileName        = fileName
        self.nbSequences     = None
        self.nbReadSequences = 0
        self.currentLine     = None
        self.size            = None
        self.sizes           = None
        # defined before open() so that __del__ can always rely on it
        self.handle          = None
        try:
            self.handle = open(self.fileName, "rb")
        except IOError:
            raise Exception("Error! Sequence file '%s' does not exist! Exiting..." % (self.fileName))


    def __del__(self):
        """
        Destructor
        """
        # the handle is missing or None when the constructor failed
        handle = getattr(self, "handle", None)
        if handle is not None and not handle.closed:
            handle.close()


    def close(self):
        """
        Close file handle
        """
        self.handle.close()


    def reset(self):
        """
        Prepare the file to be read again from start
        """
        self.handle.seek(0)
        self.currentLine     = None
        self.nbReadSequences = 0


    # NOTE(review): this static method takes one (unused) parameter, so it
    # cannot be called without argument; sub-classes presumably override it.
    def getFileFormats(self):
        pass
    getFileFormats = staticmethod(getFileFormats)


    def parse(self):
        """
        Parse the whole file in one shot
        @return: a list of sequence
        """
        sequenceList = SequenceList()
        progress     = Progress(self.getNbSequences(), "Reading %s" % (self.fileName), self.verbosity)
        for sequence in self.getIterator():
            sequenceList.addSequence(sequence)
            progress.inc()
        progress.done()
        return sequenceList


    def getIterator(self):
        """
        Iterate on the file, sequence by sequence
        @return: an iterator to sequences
        """
        self.reset()
        sequence = self.parseOne()
        while sequence is not None:
            self.nbReadSequences += 1
            yield sequence
            sequence = self.parseOne()


    def getInfos(self):
        """
        Get some generic information about the sequences
        (number of sequences and cumulated size)
        """
        self.nbSequences = 0
        self.size        = 0
        self.reset()
        if self.verbosity >= 10:
            print("Getting information on %s." % (self.fileName))
        for sequence in self.getIterator():
            self.nbSequences += 1
            self.size        += sequence.getSize()
            if self.verbosity >= 10 and self.nbSequences % 100000 == 0:
                sys.stdout.write("    %d sequences read\r" % (self.nbSequences))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            print("    %d sequences read" % (self.nbSequences))
            print("Done.")


    def getNbSequences(self):
        """
        Get the number of sequences in the file
        @return: the number of sequences
        """
        if self.nbSequences is None:
            self.getInfos()
        return self.nbSequences


    def getNbItems(self):
        """
        Get the number of sequences in the file
        @return: the number of sequences
        """
        return self.getNbSequences()


    def getSize(self):
        """
        Get the size of all the sequences
        @return: the size
        """
        if self.size is None:
            self.getInfos()
        return self.size


    def getRegions(self):
        """
        Get the names of the sequences (the file is read once, then the sizes
        per sequence are kept in memory)
        @return: the names
        """
        if self.sizes is not None:
            return self.sizes.keys()

        self.sizes = {}
        self.reset()
        if self.verbosity >= 10:
            print("Getting information on %s." % (self.fileName))
        self.nbSequences = 0
        for sequence in self.getIterator():
            self.sizes[sequence.name] = sequence.getSize()
            self.nbSequences += 1
            if self.verbosity >= 10 and self.nbSequences % 100000 == 0:
                sys.stdout.write("    %d sequences read\r" % (self.nbSequences))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            print("    %d sequences read" % (self.nbSequences))
            print("Done.")
        return self.sizes.keys()


    def getSizeOfRegion(self, region):
        """
        Get the size of a sequence
        @param region: the name of the sequence
        @type  region: string
        @return:       the size of the sequence
        @raise Exception: if no sequence has this name
        """
        if self.sizes is None:
            # first call: read the file to fill the sizes
            self.getRegions()
        if region not in self.sizes:
            raise Exception("Region %s is not found" % region)
        # the size is returned whether or not the file had to be read first
        return self.sizes[region]

    def __eq__(self, o):
        """
        Two parsers are equal if they read the same file name and hold the
        same number of sequences
        """
        if o is None:
            return False
        return self.fileName == o.fileName and self.nbSequences == o.nbSequences
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/ShrimpParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/ShrimpParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,107 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from commons.core.parsing.MapperParser import MapperParser


class ShrimpParser(MapperParser):
    """Reads the output of Shrimp and turns each line into a Mapping"""

    def __init__(self, fileName, verbosity = 0):
        super(ShrimpParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(ShrimpParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Names of the formats handled by this parser"""
        return ["shrimp"]


    def skipFirstLines(self):
        """Consume the first line of the file and count it"""
        self.handle.readline()
        self.currentLineNb += 1


    def parseLine(self, line):
        """
        Build a Mapping from one Shrimp line; the process exits if the line
        or its edit string cannot be understood
        @param line: a line of the file
        @return: the corresponding Mapping
        """
        fields = re.search(r"^\s*>([^\t]+)\t+(\S+)\s+([+-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s*$", line)
        if fields is None:
            sys.exit("\nLine %d '%s' does not have a Shrimp format" % (self.currentLineNb, line))

        # groups 4-5 are the target coordinates, 6-7 the query coordinates;
        # each pair is stored as (smallest, largest)
        targetBounds = (int(fields.group(4)), int(fields.group(5)))
        queryBounds  = (int(fields.group(6)), int(fields.group(7)))

        mapping = Mapping()

        query = mapping.queryInterval
        query.setName(fields.group(1))
        query.setStart(min(queryBounds))
        query.setEnd(max(queryBounds))

        target = mapping.targetInterval
        target.setChromosome(fields.group(2))
        target.setStart(min(targetBounds))
        target.setEnd(max(targetBounds))

        mapping.setSize(int(fields.group(8)))
        mapping.setDirection(fields.group(3))

        # Consume the edit string (group 10) token by token, from the left
        remaining    = fields.group(10)
        nbMismatches = 0
        nbGaps       = 0
        while remaining != "":
            # a number followed by something else: nothing is counted
            token = re.search(r"^(\d+)(\D.*)$", remaining)
            if token is not None:
                remaining = token.group(2)
                continue
            # a trailing number: end of the edit string
            token = re.search(r"^(\d+)$", remaining)
            if token is not None:
                remaining = ""
                continue
            # one upper-case letter: counted as one mismatch
            token = re.search(r"^([A-Z])(.*)$", remaining)
            if token is not None:
                nbMismatches += 1
                remaining     = token.group(2)
                continue
            # a parenthesized word: counted as one gap per character
            token = re.search(r"^\((\w+)\)(.*)$", remaining)
            if token is not None:
                nbGaps   += len(token.group(1))
                remaining = token.group(2)
                continue
            # a dash: counted as one gap
            token = re.search(r"^-(.*)$", remaining)
            if token is not None:
                nbGaps   += 1
                remaining = token.group(1)
                continue
            sys.exit("Cannot understand edit string %s from line %s" % (remaining, line))

        mapping.setNbMismatches(nbMismatches)
        mapping.setNbGaps(nbGaps)

        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/Soap2Parser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/Soap2Parser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,148 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.SubMapping import SubMapping
from commons.core.parsing.MapperParser import MapperParser


def mappingToSubMapping(mapping):
    """
    Build a SubMapping holding the intervals, direction, size and tags of a mapping
    @param mapping: the mapping to convert
    @return: the new SubMapping (its tags are the same object as the mapping's tags)
    """
    subMapping = SubMapping()
    subMapping.targetInterval.copy(mapping.targetInterval)
    subMapping.queryInterval.copy(mapping.queryInterval)
    subMapping.setDirection(mapping.getDirection())
    subMapping.size = mapping.size
    subMapping.tags = mapping.tags
    return subMapping



class Soap2Parser(MapperParser):
    """A class that parses the output of SOAP2"""

    def __init__(self, fileName, verbosity = 0):
        super(Soap2Parser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(Soap2Parser, self).__del__()


    def getFileFormats():
        return ["soap2"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        # nothing is skipped at the start of the file
        pass


    def getIterator(self):
        """
        Iterate on the file, mapping by mapping.
        Consecutive lines with the same read name are grouped: "a" ends are
        buffered, and each following non-"a" end is merged with the oldest
        buffered mapping into one Mapping with two sub-mappings.
        Buffered mappings that were not merged are yielded as they are.
        @return: an iterator to mappings
        """
        self.reset()
        currentName     = None
        currentMappings = []
        for line in self.handle:
            mapping = self.parseLine(line)
            name    = mapping.queryInterval.name
            if name == currentName:
                if mapping.getTagValue("end") == "a":
                    currentMappings.append(mapping)
                else:
                    # NOTE(review): raises IndexError if no "a" end is buffered
                    # for this read -- confirm whether that input can occur.
                    otherEndMapping = currentMappings.pop(0)

                    newMapping  = Mapping()
                    subMappingA = mappingToSubMapping(otherEndMapping)
                    subMappingB = mappingToSubMapping(mapping)
                    subMappingB.queryInterval.setDirection(subMappingA.queryInterval.getDirection())

                    newMapping.addSubMapping(subMappingA)
                    newMapping.addSubMapping(subMappingB)

                    # the merged mapping reuses the tag dictionary of the first end
                    newMapping.tags = otherEndMapping.tags
                    newMapping.setSize(otherEndMapping.size + mapping.size)
                    # (debugging prints and an unconditional sys.exit() used to
                    # follow this call and stopped the program at the first pair)
                    newMapping.setNbMismatches(otherEndMapping.getTagValue("nbMismatches") + mapping.getTagValue("nbMismatches"))
                    newMapping.setTagValue("qualityString", otherEndMapping.getTagValue("qualityString") + mapping.getTagValue("qualityString"))
                    newMapping.setTagValue("occurrence", "%d" % (newMapping.getTagValue("nbOccurrences") - len(currentMappings)))
                    newMapping.setTagValue("ID", "%s-%s" % (name, newMapping.getTagValue("occurrence")))
                    del newMapping.tags["end"]
                    yield newMapping
            else:
                # new read name: flush what is left of the previous read
                currentName = mapping.queryInterval.name
                for currentMapping in currentMappings:
                    yield currentMapping
                currentMappings = [mapping]
            self.currentLineNb += 1
        # end of file: the mappings buffered for the last read were never
        # yielded by the loop above
        for currentMapping in currentMappings:
            yield currentMapping


    def parseLine(self, line):
        """
        Build a Mapping from one SOAP2 line; the process exits if the line
        does not match the expected format
        @param line: a line of the file
        @return: the corresponding Mapping
        """
        m = re.search(r"^\s*(\S+)\s+(\w+)\s+(\S+)\s+(\d+)\s+([ab])\s+(\d+)\s+([+-])\s+(\w+)\s+(\d+)\s+(\d+)\s+", line)
        if m == None:
            sys.exit("\nLine %d '%s' does not have a SOAP2 format" % (self.currentLineNb, line))

        # group 2 (the read itself) is matched but not used
        name          = m.group(1)
        qualityString = m.group(3)
        nbOccurrences = int(m.group(4))
        end           = m.group(5)
        size          = int(m.group(6))
        direction     = m.group(7)
        chromosome    = m.group(8)
        genomeStart   = int(m.group(9))
        nbMismatches  = int(m.group(10))

        mapping = Mapping()
        # both ends of a pair share the same name once the /1 or /2 suffix is removed
        if name.endswith("/1") or name.endswith("/2"):
            name = name[:-2]

        mapping.queryInterval.name = name
        mapping.queryInterval.setDirection(direction)
        mapping.queryInterval.setStart(1)
        mapping.queryInterval.setEnd(size)

        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(genomeStart)
        mapping.targetInterval.setSize(size)

        mapping.setDirection(direction)
        mapping.setSize(size)

        mapping.setNbMismatches(nbMismatches)
        mapping.setNbGaps(0)
        mapping.setTagValue("qualityString", qualityString)
        mapping.setTagValue("nbOccurrences", nbOccurrences)
        mapping.setTagValue("end", end)

        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/SoapParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/SoapParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,75 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from commons.core.parsing.MapperParser import MapperParser


class SoapParser(MapperParser):
    """Reads the output of SOAP and turns each line into a Mapping"""

    def __init__(self, fileName, verbosity = 0):
        super(SoapParser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(SoapParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Names of the formats handled by this parser"""
        return ["soap"]


    def skipFirstLines(self):
        # nothing is skipped at the start of the file
        pass


    def parseLine(self, line):
        """
        Build a Mapping from one SOAP line; the process exits if the line
        does not match the expected format
        @param line: a line of the file
        @return: the corresponding Mapping
        """
        fields = re.search(r"^\s*(\S+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(a)\s+(\d+)\s+([+-])\s+(\w+)\s+(\d+)\s+(\d+)", line)
        if fields is None:
            sys.exit("\nLine %d '%s' does not have a SOAP format" % (self.currentLineNb, line))

        # every size is the length of the second field
        readLength = len(fields.group(2))

        mapping = Mapping()

        query = mapping.queryInterval
        query.setName(fields.group(1))
        query.setStart(1)
        query.setSize(readLength)

        target = mapping.targetInterval
        target.setChromosome(fields.group(8))
        target.setStart(int(fields.group(9)))
        target.setSize(readLength)

        mapping.setDirection(fields.group(7))
        mapping.setSize(readLength)
        mapping.setNbMismatches(int(fields.group(10)))

        return mapping
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/SsrParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/SsrParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,170 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +import sys + +## this class can parse a Ssr results output file. SSR.pl is developped by S.Cartinhour. 
# (5/2000)
#
class SsrParser(object):
    """
    Holds the eight fields of one line of an SSR results file:
    BES name, BES redundancy, SSR number of nucleotides, SSR motif,
    SSR motif number, SSR start, SSR end and BES size (in column order).
    """

    # One entry per column, in column order:
    # (attribute name, setter name, field label used in warning messages)
    _FIELDS = (
        ("_BesName",          "setBesName",          "BES Name"),
        ("_BesRedundancy",    "setBesRedundancy",    "BES Redundancy"),
        ("_SsrNbNucleotides", "setSsrNbNucleotides", "SSR Number Nucleotides"),
        ("_SsrMotif",         "setSsrMotif",         "SSR Motif"),
        ("_SsrMotifNumber",   "setSsrMotifNumber",   "SSR Motif Number"),
        ("_SsrStart",         "setSsrStart",         "SSR Start"),
        ("_SsrEnd",           "setSsrEnd",           "SSR End"),
        ("_BesSize",          "setBesSize",          "BES Size"),
    )

    def __init__(self, BES_name='', BES_redundancy='', SSR_nbNucleotides='', SSR_Motif='', SSR_Motif_number='', SSR_start='', SSR_end='', BES_size=''):
        self._BesName = BES_name
        self._BesRedundancy = BES_redundancy
        self._SsrNbNucleotides = SSR_nbNucleotides
        self._SsrMotif = SSR_Motif
        self._SsrMotifNumber = SSR_Motif_number
        self._SsrStart = SSR_start
        self._SsrEnd = SSR_end
        self._BesSize = BES_size

    ## Equal operator: all eight fields must be equal
    #
    # @param o the object to compare with
    # @return NotImplemented (hence "not equal") when o lacks one of the fields, e.g. None
    #
    def __eq__(self, o):
        try:
            return all(getattr(self, attribute) == getattr(o, attribute) for attribute, _, _ in self._FIELDS)
        except AttributeError:
            # o is None or an unrelated object
            return NotImplemented

    ## Not-equal operator, kept consistent with __eq__
    #  (Python 2 does not derive != from ==)
    #
    def __ne__(self, o):
        result = self.__eq__(o)
        if result is NotImplemented:
            return result
        return not result

    def setBesName(self, BES_Name):
        self._BesName = BES_Name

    def setBesRedundancy(self, BES_redundancy):
        self._BesRedundancy = BES_redundancy

    def setSsrNbNucleotides(self, SSR_nbNucleotides):
        self._SsrNbNucleotides = SSR_nbNucleotides

    def setSsrMotif(self, SSR_Motif):
        self._SsrMotif = SSR_Motif

    def setSsrMotifNumber(self, SSR_Motif_number):
        self._SsrMotifNumber = SSR_Motif_number

    def setSsrStart(self, SSR_start):
        self._SsrStart = SSR_start

    def setSsrEnd(self, SSR_end):
        self._SsrEnd = SSR_end

    def setBesSize(self, BES_size):
        self._BesSize = BES_size

    def getBesName(self):
        return self._BesName

    def getBesRedundancy(self):
        return self._BesRedundancy

    def getSsrNbNucleotides(self):
        return self._SsrNbNucleotides

    def getSsrMotif(self):
        return self._SsrMotif

    def getSsrMotifNumber(self):
        return self._SsrMotifNumber

    def getSsrStart(self):
        return self._SsrStart

    def getSsrEnd(self):
        return self._SsrEnd

    def getBesSize(self):
        return self._BesSize

    ## Set the eight fields from a list of values
    #
    # A warning is written on stderr for each empty value; if at least one
    # value is empty, all the fields are reset to ''.
    #
    # @param lResults list of at least 8 values, in column order
    # @param iCurrentLineNumber line number, only used in the warnings
    #
    def setAttributes(self, lResults, iCurrentLineNumber):
        error = False
        for index, (_, setterName, label) in enumerate(self._FIELDS):
            value = lResults[index]
            if value != '':
                # go through the setter so that an overridden setter is honoured
                getattr(self, setterName)(value)
            else:
                sys.stderr.write("WARNING: The field %s is empty in SSR results file in line %s\n" % (label, iCurrentLineNumber))
                error = True

        if error:
            self._setAllToNull()

    ## Set the eight fields from one line of the file
    #
    # A line with fewer than 8 fields only produces a warning on stderr and
    # leaves the object unchanged.
    #
    # @param ssrLine the line (trailing whitespace is removed)
    # @param iCurrentLineNumber line number, only used in the warnings
    # @param fieldSeparator separator between the fields
    #
    def setAttributesFromString(self, ssrLine, iCurrentLineNumber ="", fieldSeparator ="\t"):
        lSsrLineItem = ssrLine.rstrip().split(fieldSeparator)
        if len(lSsrLineItem) < 8:
            sys.stderr.write("WARNING: The line %s is not a valid SSR Result line\n" % iCurrentLineNumber)
        else:
            self.setAttributes(lSsrLineItem, iCurrentLineNumber)

    ## Reset the eight fields to ''
    #
    def _setAllToNull(self):
        for attribute, _, _ in self._FIELDS:
            setattr(self, attribute, '')
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/TranscriptListParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/TranscriptListParser.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,182 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from SMART.Java.Python.structure.TranscriptList import TranscriptList
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

class TranscriptListParser(object):
    """A (quite generic) class that reads a list of transcripts"""

    def __init__(self, fileName, verbosity = 0):
        """
        Open the file and skip its first lines
        @param fileName:  name of the file to read
        @param verbosity: verbosity level
        @raise Exception: if the file cannot be opened
        """
        self.verbosity                 = verbosity
        self.fileName                  = fileName
        self.nbTranscripts             = None
        self.size                      = None
        self.chromosomes               = None
        self.currentTranscript         = None
        self.currentLineNb             = 0
        self.previousTranscriptAddress = None
        # set before open() so that close()/__del__ work even if open() fails
        self.handle                    = None
        try:
            self.handle = open(self.fileName)
        except IOError:
            raise Exception("Error! Transcript file '%s' does not exist! Exiting..." % (self.fileName))
        self.skipFirstLines()


    def __del__(self):
        self.close()


    def getFileFormats():
        pass
    getFileFormats = staticmethod(getFileFormats)


    def close(self):
        """
        Close the file handle, if it is still open
        """
        # getattr: the attribute may be missing on a partially built object
        handle = getattr(self, "handle", None)
        # 'closed' is the state flag; the former test on the 'close' method
        # was always false, so the file was never closed
        if handle is not None and not handle.closed:
            handle.close()
        self.handle = None


    def reset(self):
        """
        Prepare the file to be read again from the start
        """
        self.handle.seek(0)
        self.skipFirstLines()
        self.currentTranscript        = None
        self.currentLineNb            = 0
        self.currentTranscriptAddress = self.handle.tell()
        self.currentAddress           = self.handle.tell()


    def gotoAddress(self, address):
        """
        Reset the parser, then move the file position to the given address
        @param address: a position in the file
        """
        self.reset()
        self.handle.seek(address)
        self.currentTranscriptAddress = address
        self.currentAddress           = address


    def parse(self):
        """
        Read the lines of the file and store what parseLine returns in a list
        @return: a TranscriptList
        """
        # NOTE(review): the lines are read from the current position of the
        # handle, and every parseLine() result is added, whatever it is.
        transcriptList = TranscriptList()
        progress = Progress(self.getNbTranscripts(), "Reading %s" % (self.fileName), self.verbosity)
        for line in self.handle:
            self.currentLineNb += 1
            transcript = self.parseLine(line)
            transcriptList.addTranscript(transcript)
            progress.inc()
        progress.done()
        return transcriptList


    def getIterator(self):
        """
        Iterate on the file, transcript by transcript, from the start
        @return: an iterator to transcripts
        """
        self.reset()
        transcript = self.getNextTranscript()
        while transcript != None:
            yield transcript
            transcript = self.getNextTranscript()


    def getCurrentAddress(self):
        return self.currentAddress


    def getCurrentTranscriptAddress(self):
        return self.currentTranscriptAddress


    def getNextTranscript(self):
        """
        Read lines until parseLine returns a transcript
        @return: the transcript; at the end of the file, the pending
                 self.currentTranscript (possibly None)
        """
        self.currentAddress = self.handle.tell()
        line = self.handle.readline()
        while line != "":
            line = line.strip()
            self.currentLineNb += 1
            transcript = self.parseLine(line)
            if transcript != None:
                return transcript
            self.currentAddress = self.handle.tell()
            line = self.handle.readline()
        # end of file: hand over the transcript that is still pending
        transcript = self.currentTranscript
        self.currentTranscriptAddress = self.previousTranscriptAddress
        self.currentTranscript = None
        return transcript


    def getInfos(self):
        """
        Read the whole file to compute the set of chromosomes, the number of
        transcripts and the sum of their sizes, then reset the parser
        """
        self.chromosomes   = set()
        self.nbTranscripts = 0
        self.size          = 0
        self.reset()
        progress = UnlimitedProgress(100000, "Getting information on %s." % (self.fileName), self.verbosity-9)
        # getIterator() resets the parser itself: the extra getNextTranscript()
        # call that used to precede this loop was read and then thrown away
        for transcript in self.getIterator():
            self.chromosomes.add(transcript.getChromosome())
            self.nbTranscripts += 1
            self.size          += transcript.getSize()
            progress.inc()
        progress.done()
        self.reset()


    def getNbTranscripts(self):
        if self.nbTranscripts != None:
            return self.nbTranscripts
        self.getInfos()
        return self.nbTranscripts


    def getNbItems(self):
        return self.getNbTranscripts()


    def getChromosomes(self):
        if self.chromosomes != None:
            return self.chromosomes
        self.getInfos()
        return self.chromosomes


    def getSize(self):
        if self.size != None:
            return self.size
        self.getInfos()
        return self.size


    def getNbNucleotides(self):
        return self.getSize()


    def setDefaultTagValue(self, name, value):
        for transcript in self.getIterator():
            transcript.setTag(name, value)

    def __eq__(self, o):
        if o == None:
            return False
        return self.fileName == o.fileName and self.nbTranscripts == o.nbTranscripts and self.size == o.size and self.chromosomes == o.chromosomes
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/VarscanFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/VarscanFile.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,145 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.parsing.VarscanHit import VarscanHit
from commons.core.parsing.VarscanHit_WithTag import VarscanHit_WithTag
from commons.core.parsing.VarscanHit_v2_2_8 import VarscanHit_v2_2_8
from commons.core.checker.CheckerException import CheckerException
from commons.core.parsing.VarscanHit_v2_2_8_WithTag import VarscanHit_v2_2_8_WithTag

## Reads a varscan file and stores one hit object per data line.
#  The kind of hit object is chosen from the number of columns of each line.
#
class VarscanFile(object):

    def __init__(self, varscanFileName = ""):
        self._varscanFileName = varscanFileName
        self._varscanFieldSeparator = "\t"
        self._lVarscanHits = []
        self._typeOfVarscanFile = ""
        # initialised so that getHeaderVarcanFile() works on a file without header
        self._headerVarcanFile = ""

    ## Equal operator: same separator, same hits, same file name
    #
    def __eq__(self, o):
        return self._varscanFieldSeparator == o._varscanFieldSeparator and self._lVarscanHits == o._lVarscanHits and self._varscanFileName == o._varscanFileName

    def setVarscanHitsList(self, lVarscanHits):
        self._lVarscanHits = lVarscanHits

    def setHeaderVarcanFile(self, headerVarcanFile):
        self._headerVarcanFile = headerVarcanFile

    ## Set the type of the file; any unknown type is stored as ""
    #
    def setTypeOfVarscanFile(self, type):
        if type in ("Varscan_2_2", "Varscan_2_2_WithTag", "Varscan_2_2_8", "Varscan_2_2_8_WithTag"):
            self._typeOfVarscanFile = type
        else:
            self._typeOfVarscanFile = ""

    def getVarscanHitsList(self):
        return self._lVarscanHits

    def getHeaderVarcanFile(self):
        return self._headerVarcanFile

    def getListOfVarscanHits(self):
        return self._lVarscanHits

    def getTypeOfVarscanFile(self):
        return self._typeOfVarscanFile

    ## Read the file and append one hit per data line to the list of hits
    #
    # The first line is kept as header if it contains "Chrom\tPosition";
    # any later line containing it is skipped. The type of the file is
    # updated from the number of columns of each data line.
    #
    # @raise CheckerException if a data line has an unexpected number of columns
    #
    def parse(self):
        varscanFile = open(self._varscanFileName, "r")
        # try/finally: the file is also closed when a line is rejected
        try:
            currentLineNumber = 0
            line = varscanFile.readline()
            if "Chrom\tPosition" in line:
                self.setHeaderVarcanFile(line)
                line = varscanFile.readline()
            while line != "":
                if not "Chrom\tPosition" in line:
                    currentLineNumber += 1
                    line = line.strip()
                    nbFields = len(line.split(self._varscanFieldSeparator))
                    if nbFields == 12:
                        currentVarscanLine = self.createVarscanHit(line, currentLineNumber)
                        self._typeOfVarscanFile = "Varscan_2_2"
                    elif nbFields == 13:
                        currentVarscanLine = self.createVarscanHitWithTag(line, currentLineNumber)
                        self._typeOfVarscanFile = "Varscan_2_2_WithTag"
                    elif nbFields == 19:
                        currentVarscanLine = self.createVarscanHit_v2_2_8(line, currentLineNumber)
                        self._typeOfVarscanFile = "Varscan_2_2_8"
                    elif nbFields == 20:
                        currentVarscanLine = self.createVarscanHit_v2_2_8_WithTag(line, currentLineNumber)
                        self._typeOfVarscanFile = "Varscan_2_2_8_WithTag"
                    else:
                        raise CheckerException ("Warning: this line (l.%s) is not a valid varscan line !" % currentLineNumber)
                    self._lVarscanHits.append(currentVarscanLine)
                line = varscanFile.readline()
        finally:
            varscanFile.close()

    ## Create the hit object matching the current type of the file
    #
    # @return the hit, or None when the type of the file is not set
    #
    def createVarscanObjectFromLine(self, line, currentLineNumber):
        if self._typeOfVarscanFile == "Varscan_2_2":
            return self.createVarscanHit(line, currentLineNumber)
        elif self._typeOfVarscanFile == "Varscan_2_2_WithTag":
            return self.createVarscanHitWithTag(line, currentLineNumber)
        elif self._typeOfVarscanFile == "Varscan_2_2_8":
            return self.createVarscanHit_v2_2_8(line, currentLineNumber)
        elif self._typeOfVarscanFile == "Varscan_2_2_8_WithTag":
            return self.createVarscanHit_v2_2_8_WithTag(line, currentLineNumber)

    def createVarscanHit(self, line, currentLineNumber):
        iVarscanHit =  VarscanHit()
        iVarscanHit.setAttributesFromString(line, currentLineNumber)
        return iVarscanHit

    def createVarscanHitWithTag(self, line, currentLineNumber):
        iVarscanHitWithTag =  VarscanHit_WithTag()
        iVarscanHitWithTag.setAttributesFromString(line, currentLineNumber)
        return iVarscanHitWithTag

    def createVarscanHit_v2_2_8(self, line, currentLineNumber):
        iVarscanHit =  VarscanHit_v2_2_8()
        iVarscanHit.setAttributesFromString(line, currentLineNumber)
        return iVarscanHit

    def createVarscanHit_v2_2_8_WithTag(self, line, currentLineNumber):
        iVarscanHitWithTag =  VarscanHit_v2_2_8_WithTag()
        iVarscanHitWithTag.setAttributesFromString(line, currentLineNumber)
        return iVarscanHitWithTag

    ## Create an empty hit object matching the current type of the file
    #
    # @raise CheckerException if the type of the file is not set
    #
    def selectTypeOfVarscanHitObject(self):
        if self._typeOfVarscanFile == "":
            raise CheckerException ("Error: no varscan object found !")
        elif self._typeOfVarscanFile == "Varscan_2_2":
            return VarscanHit()
        elif self._typeOfVarscanFile == "Varscan_2_2_WithTag":
            return VarscanHit_WithTag()
        elif self._typeOfVarscanFile == "Varscan_2_2_8":
            return VarscanHit_v2_2_8()
        elif self._typeOfVarscanFile == "Varscan_2_2_8_WithTag":
            return VarscanHit_v2_2_8_WithTag()
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/VarscanFileForGnpSNP.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/VarscanFileForGnpSNP.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,72 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.parsing.VarscanHitForGnpSNP import VarscanHitForGnpSNP
from commons.core.parsing.VarscanFile import VarscanFile


## Varscan file whose lines are turned into VarscanHitForGnpSNP objects
#
# In addition to what VarscanFile holds, this class records the fastq file
# name, the reference fasta file name and the taxon name, and remembers the
# last hit it created so that consecutive hits can be numbered
# (see createVarscanHit).
#
class VarscanFileForGnpSNP(VarscanFile):

    # Number of columns indexed by the setAttributes() methods of VarscanHit
    # and VarscanHitForGnpSNP (lResults[0] to lResults[11]).
    _NB_EXPECTED_FIELDS = 12

    ## Constructor
    #
    # @param varscanFileName string forwarded unchanged to VarscanFile.__init__
    # @param fastqFileName string stored as is
    # @param refFastaFileName string stored as is
    # @param taxonName string stored as is
    #
    def __init__(self, varscanFileName, fastqFileName="", refFastaFileName="", taxonName=""):
        VarscanFile.__init__(self, varscanFileName)
        self._fastqFileName = fastqFileName
        self._refFastaFileName = refFastaFileName
        self._taxonName = taxonName
        # Last hit built by createVarscanHit(), None until the first call
        self._previousVarscanHit = None

    ## Equal operator
    #
    # Both objects must be equal according to VarscanFile.__eq__ and carry the
    # same fastq file name, reference fasta file name and taxon name.
    #
    # @param o a VarscanFileForGnpSNP instance
    #
    def __eq__(self, o):
        return VarscanFile.__eq__(self, o) and self._fastqFileName == o._fastqFileName \
            and self._refFastaFileName == o._refFastaFileName and self._taxonName == o._taxonName

    ## @return the field separator used to split the lines
    #
    # NOTE(review): _varscanFieldSeparator is not assigned in this class;
    # presumably VarscanFile sets it - confirm.
    #
    def getVarscanFieldSeparator(self):
        return self._varscanFieldSeparator

    def getFastqFileName(self):
        return self._fastqFileName

    def getRefFastaFileName(self):
        return self._refFastaFileName

    def getTaxonName(self):
        return self._taxonName

    ## Build a VarscanHitForGnpSNP from one line of the varscan file
    #
    # The new hit is compared with the previously created one through
    # manageOccurrence(), then becomes the "previous" hit for the next call.
    #
    # @param line string, one line of the varscan file
    # @param currentLineNumber line number, handed to setAttributes() for its error messages
    # @return the new VarscanHitForGnpSNP instance
    #
    def createVarscanHit(self, line, currentLineNumber):
        lResults = line.strip().split(self._varscanFieldSeparator)
        # strip() removes trailing whitespace, hence trailing empty columns
        # when the separator is itself whitespace (e.g. a tab). Pad the list
        # back, as VarscanHit.setAttributesFromString() does, so that
        # setAttributes() reports the empty field with its own
        # CheckerException instead of failing with a bare IndexError.
        nbMissingFields = self._NB_EXPECTED_FIELDS - len(lResults)
        if nbMissingFields > 0:
            lResults.extend([""] * nbMissingFields)
        iVarscanHit = VarscanHitForGnpSNP()
        iVarscanHit.setAttributes(lResults, currentLineNumber)
        iVarscanHit.formatAlleles2GnpSnp()
        iVarscanHit.manageOccurrence(self._previousVarscanHit)
        self._previousVarscanHit = iVarscanHit
        return iVarscanHit
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/VarscanHit.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/VarscanHit.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,175 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.checker.CheckerException import CheckerException


## One record of a Varscan output file in the 12-column layout
#
# Columns, in order: Chrom, Position, Ref, Var, Reads1, Reads2, VarFreq,
# Strands1, Strands2, Qual1, Qual2, Pvalue (see getHeader). Values are kept
# exactly as they are given; no conversion is performed by this class.
#
class VarscanHit(object):

    ## Constructor
    #
    # Every parameter defaults to the empty string and is stored unchanged.
    #
    def __init__(self, chrom = "", position = "", ref = "", var = "", readsRef = "", readsVar = "", varFreq = "", strandsRef = "", strandsVar = "", qualRef = "", qualVar = "", pValue = ""):
        self._chrom = chrom
        self._position = position
        self._ref = ref
        self._var = var
        self._readsRef = readsRef
        self._readsVar = readsVar
        self._varFreq = varFreq
        self._strandsRef = strandsRef
        self._strandsVar = strandsVar
        self._qualRef = qualRef
        self._qualVar = qualVar
        self._pValue = pValue

    ## Equal operator
    #
    # Two hits are equal when chromosome, position, reference allele and
    # variant allele are equal; the other fields are not compared.
    #
    # @param o a VarscanHit instance
    # @return False as well when o lacks one of the compared attributes
    #         (e.g. None), instead of raising AttributeError
    #
    def __eq__(self, o):
        try:
            return self._chrom == o._chrom and self._position == o._position \
                and self._ref == o._ref and self._var == o._var
        except AttributeError:
            # o is None or an object that is not a hit
            return False

    def setChrom(self, chromosome):
        self._chrom = chromosome

    def setPosition(self, position):
        self._position = position

    def setRef(self, referenceAllele):
        self._ref = referenceAllele

    def setVar(self, variantAllele):
        self._var = variantAllele

    def setReadsRef(self, readsRef):
        self._readsRef = readsRef

    def setReadsVar(self, readsVar):
        self._readsVar = readsVar

    def setVarFreq(self, varFreq):
        self._varFreq = varFreq

    def setStrandsRef(self, strandsRef):
        self._strandsRef = strandsRef

    def setStrandsVar(self, strandsVar):
        self._strandsVar = strandsVar

    def setQualRef(self, qualRef):
        self._qualRef = qualRef

    def setQualVar(self, qualVar):
        self._qualVar = qualVar

    def setPValue(self, pValue):
        self._pValue = pValue

    def getChrom(self):
        return self._chrom

    def getPosition(self):
        return self._position

    def getRef(self):
        return self._ref

    def getVar(self):
        return self._var

    def getReadsRef(self):
        return self._readsRef

    def getReadsVar(self):
        return self._readsVar

    def getVarFreq(self):
        return self._varFreq

    def getStrandsRef(self):
        return self._strandsRef

    def getStrandsVar(self):
        return self._strandsVar

    def getQualRef(self):
        return self._qualRef

    def getQualVar(self):
        return self._qualVar

    def getPValue(self):
        return self._pValue

    ## @return the tab-separated column names, terminated by a newline
    #
    def getHeader(self):
        return "Chrom\tPosition\tRef\tVar\tReads1\tReads2\tVarFreq\tStrands1\tStrands2\tQual1\tQual2\tPvalue\n"

    ## @return the 12 fields of the hit, tab-separated, in the order of getHeader(), terminated by a newline
    #
    def getVarscanLine(self):
        return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (self.getChrom(), self.getPosition(), self.getRef(), self.getVar(), self.getReadsRef(), self.getReadsVar(), self.getVarFreq(), self.getStrandsRef(), self.getStrandsVar(), self.getQualRef(), self.getQualVar(), self.getPValue())

    ## Fill the hit from a list of field values
    #
    # Fields 0 to 3 (Chrom, Position, Ref, Var) are mandatory. Fields 4 to 11
    # are optional: an empty value leaves the attribute untouched.
    #
    # @param lResults list of at least 12 strings, in the order of getHeader()
    # @param iCurrentLineNumber line number, only used in the error messages
    # @throw CheckerException when one of the four mandatory fields is empty
    #
    def setAttributes(self, lResults, iCurrentLineNumber):
        # Setters are looked up on self so that a subclass override is honoured.
        lMandatoryFields = [("Chrom", self.setChrom), ("Position", self.setPosition), ("Ref", self.setRef), ("Var", self.setVar)]
        for index, (fieldName, setter) in enumerate(lMandatoryFields):
            if lResults[index] == '':
                raise CheckerException ("The field %s is empty in varscan file in line %s" % (fieldName, iCurrentLineNumber))
            setter(lResults[index])

        lOptionalSetters = [self.setReadsRef, self.setReadsVar, self.setVarFreq, self.setStrandsRef, self.setStrandsVar, self.setQualRef, self.setQualVar, self.setPValue]
        # Optional columns start right after the four mandatory ones.
        for index, setter in enumerate(lOptionalSetters, len(lMandatoryFields)):
            if lResults[index] != '':
                setter(lResults[index])

    ## Fill the hit from one line of a varscan file
    #
    # Trailing whitespace is removed, the line is split on fieldSeparator and
    # the resulting list is padded with empty strings up to 12 items before
    # being handed to setAttributes().
    #
    # @param varscanString string, one line of the varscan file
    # @param iCurrentLineNumber line number, only used in the error messages
    # @param fieldSeparator string separating the fields (tab by default)
    # @throw CheckerException when one of the four mandatory fields is empty
    #
    def setAttributesFromString(self, varscanString, iCurrentLineNumber ="", fieldSeparator ="\t"):
        lvarscanStringItem = varscanString.rstrip().split(fieldSeparator)
        nbMissingFields = 12 - len(lvarscanStringItem)
        if nbMissingFields > 0:
            lvarscanStringItem.extend([""] * nbMissingFields)
        self.setAttributes(lvarscanStringItem, iCurrentLineNumber)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/VarscanHitForGnpSNP.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/VarscanHitForGnpSNP.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,232 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+from commons.core.checker.CheckerException import CheckerException\n+from commons.core.parsing.VarscanHit import VarscanHit\n+import re\n+\n+class VarscanHitForGnpSNP(VarscanHit):\n+ \n+ def __init__(self):\n+ VarscanHit.__init__(self)\n+ self._reads1 = \'\'\n+ self._reads2 = \'\'\n+ self._varFreq = \'\'\n+ self._strands1 = \'\'\n+ self._strands2 = \'\'\n+ self._qual1 = \'\'\n+ self._qual2 = \'\'\n+ self._pvalue = \'\'\n+ self._5flank = \'\'\n+ self._3flank = \'\'\n+ self._gnpSnp_ref = \'\'\n+ self._gnpSnp_var = \'\'\n+ self._gnpSnp_position = 0\n+ self._polymType = \'\'\n+ self._polymLength = 0\n+ self._occurrence = 1\n+ \n+ ## Equal operator\n+ #\n+ # @param o a VarscanFileAnalysis instance\n+ # \n+ def __eq__(self, o):\n+ return VarscanHit.__eq__(self, o) \\\n+ and self._reads1 == o._reads1 and self._reads2 == o._reads2 \\\n+ and self._varFreq == o._varFreq and self._strands1 == o._strands1 \\\n+ and self._strands2 == o._strands2 and self._qual1 == o._qual1 \\\n+ and self._qual2 == o._qual2 and self._pvalue == o._pvalue \\\n+ and self._3flank == o._3flank and self._5flank == o._5flank \\\n+ and self._gnpSnp_position == o._gnpSnp_position and self._gnpSnp_ref == o._gnpSnp_ref \\\n+ and self._gnpSnp_var == o._gnpSnp_var and self._polymLength == o._polymLength \\\n+ and self._polymType == o._polymType and self._occurrence == o._occurrence\n+ \n+ def isPolymTypeAlreadyFoundAtThisChromAndThisPosition(self, iVarscanHitForGnpSNP):\n+ return self._chrom == iVarscanHitForGnpSNP.getChrom() \\\n+ and self._position == iVarscanHitForGnpSNP.getPosition() \\\n+ and self._polymType == iVarscanHitForGnpSNP.getPolymType()\n+ \n+ def manageOccurrence(self, iVarscanHitForGnpSNP=None):\n+ if iVarscanHitForGnpSNP != None and self.isPolymTypeAlreadyFoundAtThisChromAndThisPosition(iVarscanHitForGnpSNP):\n+ self._occurrence = 
iVarscanHitForGnpSNP.getOccurrence() + 1\n+ \n+ def formatAlleles2GnpSnp(self):\n+ if self.getVar().find("-") != -1:\n+ self._polymType = "DELETION"\n+ self._gnpSnp_position = int(self._position) + 1\n+ self._gnpSnp_ref = self._var[1:]\n+ self._g'..b'randsOfReferenceAllele):\n+ self._strands1 = strandsOfReferenceAllele\n+ \n+ def setStrands2(self, strandsOfVariantAllele):\n+ self._strands2 = strandsOfVariantAllele\n+ \n+ def setQual1(self, averageQualityOfRef):\n+ self._qual1 = averageQualityOfRef\n+ \n+ def setQual2(self, averageQualityOfVar):\n+ self._qual2 = averageQualityOfVar\n+ \n+ def setPvalue(self, pvalue):\n+ self._pvalue = pvalue\n+ \n+ def set5flank(self, s5flank):\n+ self._5flank = s5flank\n+ \n+ def set3flank(self, s3flank):\n+ self._3flank = s3flank\n+ \n+ def setGnpSNPRef(self, ref):\n+ self._gnpSnp_ref = ref\n+ \n+ def setGnpSNPVar(self, var):\n+ self._gnpSnp_var = var\n+ \n+ def setGnpSNPPosition(self, position):\n+ self._gnpSnp_position = position\n+ \n+ def setOccurrence(self, occurrence):\n+ self._occurrence = occurrence\n+ \n+ def setPolymType(self, polymType):\n+ self._polymType = polymType\n+ \n+ def setPolymLength(self, polymLength):\n+ self._polymLength = polymLength\n+ \n+ def getReads1(self):\n+ return self._reads1\n+ \n+ def getReads2(self):\n+ return self._reads2\n+ \n+ def getVarFreq(self):\n+ return self._varFreq\n+ \n+ def getStrands1(self):\n+ return self._strands1\n+ \n+ def getStrands2(self):\n+ return self._strands2\n+ \n+ def getQual1(self):\n+ return self._qual1\n+ \n+ def getQual2(self):\n+ return self._qual2\n+ \n+ def getPvalue(self):\n+ return self._pvalue\n+ \n+ def get5flank(self):\n+ return self._5flank\n+ \n+ def get3flank(self):\n+ return self._3flank\n+ \n+ def getPolymType(self):\n+ return self._polymType\n+ \n+ def getGnpSnpVar(self):\n+ return self._gnpSnp_var\n+ \n+ def getGnpSnpRef(self):\n+ return self._gnpSnp_ref\n+ \n+ def getGnpSnpPosition(self):\n+ return self._gnpSnp_position\n+ \n+ def 
getPolymLength(self):\n+ return self._polymLength\n+ \n+ def getOccurrence(self):\n+ return self._occurrence\n+ \n+ def setAttributes(self, lResults, iCurrentLineNumber):\n+ VarscanHit.setAttributes(self, lResults, iCurrentLineNumber)\n+ if lResults[4] != \'\':\n+ self.setReads1(lResults[4])\n+ else:\n+ raise CheckerException ("The field Reads1 is empty in varscan file in line %s" % (iCurrentLineNumber))\n+ if lResults[5] != \'\':\n+ self.setReads2(lResults[5])\n+ else:\n+ raise CheckerException ("The field Reads2 is empty in varscan file in line %s" % (iCurrentLineNumber))\n+ if lResults[6] != \'\' and re.match("[0-9\\,\\%]+", lResults[6]):\n+ self.setVarFreq(lResults[6])\n+ else:\n+ raise CheckerException ("The field VarFreq is empty or in bad format in varscan file in line %s" % (iCurrentLineNumber))\n+ if lResults[7] != \'\':\n+ self.setStrands1(lResults[7])\n+ else:\n+ raise CheckerException ("The field Strands1 is empty in varscan file in line %s" % (iCurrentLineNumber))\n+ if lResults[8] != \'\':\n+ self.setStrands2(lResults[8])\n+ else:\n+ raise CheckerException ("The field Strands2 is empty in varscan file in line %s" % (iCurrentLineNumber))\n+ if lResults[9] != \'\':\n+ self.setQual1(lResults[9])\n+ else:\n+ raise CheckerException ("The field Qual1 is empty in varscan file in line %s" % (iCurrentLineNumber))\n+ if lResults[10] != \'\':\n+ self.setQual2(lResults[10])\n+ else:\n+ raise CheckerException ("The field Qual2 is empty in varscan file in line %s" % (iCurrentLineNumber))\n+ if lResults[11] != \'\':\n+ self.setPvalue(lResults[11])\n+ else:\n+ raise CheckerException ("The field Pvalue is empty in varscan file in line %s" % (iCurrentLineNumber))\n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/VarscanHit_WithTag.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/VarscanHit_WithTag.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,70 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.parsing.VarscanHit import VarscanHit


## Varscan hit in the 12-column layout followed by a 13th "Tag" column
#
class VarscanHit_WithTag(VarscanHit):

    ## Constructor
    #
    # @param tag value of the Tag column, stored unchanged
    # The other parameters are forwarded unchanged to VarscanHit.__init__.
    #
    def __init__(self, tag = "", chrom = "", position = "", ref = "", var = "", readsRef = "", readsVar = "", varFreq = "", strandsRef = "", strandsVar = "", qualRef = "", qualVar = "", pValue = ""):
        self._tag = tag
        VarscanHit.__init__(self, chrom, position, ref, var, readsRef, readsVar, varFreq, strandsRef, strandsVar, qualRef, qualVar, pValue)

    ## Equal operator
    #
    # Two hits are equal when their tags are equal and VarscanHit.__eq__
    # also holds.
    #
    # @param o a VarscanHit_WithTag instance
    # @return False as well when o lacks one of the compared attributes
    #         (e.g. a hit without tag, or None), instead of raising AttributeError
    #
    def __eq__(self, o):
        try:
            if self._tag == o._tag:
                return VarscanHit.__eq__(self, o)
        except AttributeError:
            # o has no tag, or is not a hit at all
            pass
        return False

    def setTag(self, tag):
        self._tag = tag

    def getTag(self):
        return self._tag

    ## @return the tab-separated column names (Tag last), terminated by a newline
    #
    def getHeader(self):
        return "Chrom\tPosition\tRef\tVar\tReads1\tReads2\tVarFreq\tStrands1\tStrands2\tQual1\tQual2\tPvalue\tTag\n"

    ## @return the 13 fields of the hit, tab-separated, in the order of getHeader(), terminated by a newline
    #
    def getVarscanLine(self):
        return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (self.getChrom(), self.getPosition(), self.getRef(), self.getVar(), self.getReadsRef(), self.getReadsVar(), self.getVarFreq(), self.getStrandsRef(), self.getStrandsVar(), self.getQualRef(), self.getQualVar(), self.getPValue(), self.getTag())

    ## Fill the hit from a list of field values
    #
    # The first 12 items are handled by VarscanHit.setAttributes(); item 12 is
    # the tag, left untouched when it is empty.
    #
    # @param lResults list of at least 13 strings, in the order of getHeader()
    # @param iCurrentLineNumber line number, only used in the error messages
    #
    def setAttributes(self, lResults, iCurrentLineNumber):
        VarscanHit.setAttributes(self, lResults, iCurrentLineNumber)
        if lResults[12] != '':
            self.setTag(lResults[12])

    ## Fill the hit from one line of a varscan file
    #
    # Trailing whitespace is removed, the line is split on fieldSeparator and
    # the resulting list is padded with empty strings up to 13 items before
    # being handed to setAttributes().
    #
    # @param varscanString string, one line of the varscan file
    # @param iCurrentLineNumber line number, only used in the error messages
    # @param fieldSeparator string separating the fields (tab by default)
    #
    def setAttributesFromString(self, varscanString, iCurrentLineNumber ="", fieldSeparator ="\t"):
        lvarscanStringItem = varscanString.rstrip().split(fieldSeparator)
        nbMissingFields = 13 - len(lvarscanStringItem)
        if nbMissingFields > 0:
            lvarscanStringItem.extend([""] * nbMissingFields)
        self.setAttributes(lvarscanStringItem, iCurrentLineNumber)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/VarscanHit_v2_2_8.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/VarscanHit_v2_2_8.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,176 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+from commons.core.checker.CheckerException import CheckerException\n+from commons.core.parsing.VarscanHit import VarscanHit\n+\n+class VarscanHit_v2_2_8(VarscanHit):\n+ \n+ def __init__(self, chrom = "", position = "", ref = "", cns = "", readsRef = "", readsVar = "", varFreq = "", strandsRef = "", strandsVar = "", qualRef = "", qualVar = "", pValue = "", mapQualRef = "", mapQualVar = "", readsRefPlus = "", readsRefMinus = "", readsVarPlus = "", readsVarMinus = "", var = ""):\n+ self._cns = cns\n+ self._mapQualRef = mapQualRef\n+ self._mapQualVar = mapQualVar\n+ self._readsRefPlus = readsRefPlus\n+ self._readsRefMinus = readsRefMinus\n+ self._readsVarPlus = readsVarPlus\n+ self._readsVarMinus = readsVarMinus\n+ VarscanHit.__init__(self, chrom, position, ref, var, readsRef, readsVar, varFreq, strandsRef, strandsVar, qualRef, qualVar, pValue)\n+ \n+ ## Equal operator\n+ #\n+ # @param o a VarscanFileAnalysis instance\n+ # \n+ def __eq__(self, o):\n+ if self._cns == o._cns:\n+ return VarscanHit.__eq__(self, o)\n+ return False\n+ \n+ def setCns(self, consensus):\n+ self._cns = consensus\n+ \n+ def setMapQualRef(self, mapQualRef):\n+ self._mapQualRef = mapQualRef\n+ \n+ def setMapQualVar(self, mapQualVar):\n+ self._mapQualVar = mapQualVar\n+ \n+ def setReadsRefPlus(self, readsRefPlus):\n+ self._readsRefPlus = readsRefPlus\n+ \n+ def setReadsRefMinus(self, readsRefMinus):\n+ self._readsRefMinus = readsRefMinus\n+ \n+ def setReadsVarPlus(self, readsVarPlus):\n+ self._readsVarPlus = readsVarPlus\n+ \n+ def setReadsVarMinus(self, readsVarMinus):\n+ self._readsVarMinus = readsVarMinus\n+ \n+ def getCns(self):\n+ return self._cns\n+ \n+ def getMapQualRef(self):\n+ return self._mapQualRef\n+ \n+ def getMapQualVar(self):\n+ return self._mapQualVar\n+ \n+ def getReadsRefPlus(self):\n+ return self._readsRefPlus\n+ \n+ def 
getReadsRefMinus(self):\n+ return self._readsRefMinus\n+ \n+ def getReadsVarPlus(self):\n+ return self._readsVarPlus\n+ \n+ def getReadsVarMinus(self):\n+ return self._readsVarMinus\n+ \n+ def getHeader(self):\n+ return "Chrom\\tPosition\\tRef\\tC'..b'\\tStrands1\\tStrands2\\tQual1\\tQual2\\tPvalue\\tMapQual1\\tMapQual2\\tReads1Plus\\tReads1Minus\\tReads2Plus\\tReads2Minus\\tVarAllele\\n"\n+ \n+ def getVarscanLine(self):\n+ return "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n" % (self.getChrom(), self.getPosition(), self.getRef(), self.getCns(), self.getReadsRef(), self.getReadsVar(), self.getVarFreq(), self.getStrandsRef(), self.getStrandsVar(), self.getQualRef(), self.getQualVar(), self.getPValue(), self.getMapQualRef(), self.getMapQualVar(), self.getReadsRefPlus(), self.getReadsRefMinus(), self.getReadsVarPlus(), self.getReadsVarMinus(), self.getVar())\n+ \n+ def setAttributes(self, lResults, iCurrentLineNumber):\n+ if lResults[0] != \'\':\n+ self.setChrom(lResults[0])\n+ else:\n+ raise CheckerException ("The field Chrom is empty in varscan file in line %s" % iCurrentLineNumber)\n+ if lResults[1] != \'\':\n+ self.setPosition(lResults[1])\n+ else:\n+ raise CheckerException ("The field Position is empty in varscan file in line %s" % iCurrentLineNumber)\n+ if lResults[2] != \'\':\n+ self.setRef(lResults[2])\n+ else:\n+ raise CheckerException ("The field Ref is empty in varscan file in line %s" % iCurrentLineNumber)\n+ if lResults[3] != \'\':\n+ self.setCns(lResults[3])\n+ else:\n+ raise CheckerException ("The field Cons is empty in varscan file in line %s" % iCurrentLineNumber)\n+ if lResults[4] != \'\':\n+ self.setReadsRef(lResults[4])\n+ if lResults[5] != \'\':\n+ self.setReadsVar(lResults[5])\n+ if lResults[6] != \'\':\n+ self.setVarFreq(lResults[6])\n+ if lResults[7] != \'\':\n+ self.setStrandsRef(lResults[7])\n+ if lResults[8] != \'\':\n+ self.setStrandsVar(lResults[8])\n+ if lResults[9] != \'\':\n+ 
self.setQualRef(lResults[9])\n+ if lResults[10] != \'\':\n+ self.setQualVar(lResults[10])\n+ if lResults[11] != \'\':\n+ self.setPValue(lResults[11])\n+ if lResults[12] != \'\':\n+ self.setMapQualRef(lResults[12])\n+ if lResults[13] != \'\':\n+ self.setMapQualVar(lResults[13])\n+ if lResults[14] != \'\':\n+ self.setReadsRefPlus(lResults[14])\n+ if lResults[15] != \'\':\n+ self.setReadsRefMinus(lResults[15])\n+ if lResults[16] != \'\':\n+ self.setReadsVarPlus(lResults[16])\n+ if lResults[17] != \'\':\n+ self.setReadsVarMinus(lResults[17])\n+ if lResults[18] != \'\':\n+ self.setVar(lResults[18])\n+ else:\n+ raise CheckerException ("The field varAllele is empty in varscan file in line %s" % iCurrentLineNumber)\n+ \n+ def setAttributesFromString(self, varscanString, iCurrentLineNumber ="", fieldSeparator ="\\t"):\n+ varscanString = varscanString.rstrip()\n+ lvarscanStringItem = varscanString.split(fieldSeparator)\n+ if len(lvarscanStringItem) < 19:\n+ raise CheckerException ("This varscan line (l.%s) is not complete" % iCurrentLineNumber)\n+ self.setAttributes(lvarscanStringItem, iCurrentLineNumber)\n+ \n+ def convertVarscanHit_v2_2_8_To_VarscanHit(self):\n+ iVarscanHit = VarscanHit()\n+ iVarscanHit.setChrom(self.getChrom())\n+ iVarscanHit.setPosition(self.getPosition())\n+ iVarscanHit.setRef(self.getRef())\n+ iVarscanHit.setVar(self.getVar())\n+ iVarscanHit.setReadsRef(self.getReadsRef())\n+ iVarscanHit.setReadsVar(self.getReadsVar())\n+ iVarscanHit.setVarFreq(self.getVarFreq())\n+ iVarscanHit.setStrandsRef(self.getStrandsRef())\n+ iVarscanHit.setStrandsVar(self.getStrandsVar())\n+ iVarscanHit.setQualRef(self.getQualRef())\n+ iVarscanHit.setQualVar(self.getQualVar())\n+ iVarscanHit.setPValue(self.getPValue())\n+ return iVarscanHit\n+ \n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/VarscanHit_v2_2_8_WithTag.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/VarscanHit_v2_2_8_WithTag.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,88 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
from commons.core.checker.CheckerException import CheckerException
from commons.core.parsing.VarscanHit_v2_2_8 import VarscanHit_v2_2_8
from commons.core.parsing.VarscanHit_WithTag import VarscanHit_WithTag


## Varscan 2.2.8 hit extended with one trailing "Tag" column
#
class VarscanHit_v2_2_8_WithTag(VarscanHit_v2_2_8):
    """One line of a Varscan 2.2.8 file that carries an extra 20th column, the tag.

    All the Varscan columns are handled by VarscanHit_v2_2_8; this class only
    adds storage, parsing, comparison and formatting of the tag.
    """

    # Number of tab-separated columns of a tagged line: the 19 columns listed
    # in getHeader() before "Tag", plus the tag itself.
    NB_FIELDS = 20

    def __init__(self, chrom = "", position = "", ref = "", cns = "", readsRef = "", readsVar = "", varFreq = "", strandsRef = "", strandsVar = "", qualRef = "", qualVar = "", pValue = "", mapQualRef = "", mapQualVar = "", readsRefPlus = "", readsRefMinus = "", readsVarPlus = "", readsVarMinus = "", var = "", tag = ""):
        """Build a hit; every argument defaults to the empty string.

        @param cns consensus call (column "Cons")
        @param var variant allele (column "VarAllele")
        @param tag free tag stored in the last column
        """
        self._tag = tag
        # Forward 'cns' as the consensus call. The previous code passed 'var'
        # in 4th position, so the 'cns' argument was silently discarded.
        # NOTE(review): assumes VarscanHit_v2_2_8.__init__ takes the same
        # argument order as this constructor (cns 4th, var last) - confirm.
        VarscanHit_v2_2_8.__init__(self, chrom, position, ref, cns, readsRef, readsVar, varFreq, strandsRef, strandsVar, qualRef, qualVar, pValue, mapQualRef, mapQualVar, readsRefPlus, readsRefMinus, readsVarPlus, readsVarMinus, var)

    def __eq__(self, o):
        """Two hits are equal when their tags match and the parent class says so.

        Objects without a '_tag' attribute (None, untagged hits, ...) compare
        unequal instead of raising AttributeError.
        """
        noTag = object()  # unique sentinel: never equal to a real tag
        if self._tag == getattr(o, "_tag", noTag):
            return VarscanHit_v2_2_8.__eq__(self, o)
        return False

    def setTag(self, tag):
        """Set the tag."""
        self._tag = tag

    def getTag(self):
        """Return the tag."""
        return self._tag

    def getHeader(self):
        """Return the tab-separated header line, newline included."""
        return "Chrom\tPosition\tRef\tCons\tReads1\tReads2\tVarFreq\tStrands1\tStrands2\tQual1\tQual2\tPvalue\tMapQual1\tMapQual2\tReads1Plus\tReads1Minus\tReads2Plus\tReads2Minus\tVarAllele\tTag\n"

    def getVarscanLine(self):
        """Return this hit formatted as one tab-separated line, newline included."""
        return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (self.getChrom(), self.getPosition(), self.getRef(), self.getCns(), self.getReadsRef(), self.getReadsVar(), self.getVarFreq(), self.getStrandsRef(), self.getStrandsVar(), self.getQualRef(), self.getQualVar(), self.getPValue(), self.getMapQualRef(), self.getMapQualVar(), self.getReadsRefPlus(), self.getReadsRefMinus(), self.getReadsVarPlus(), self.getReadsVarMinus(), self.getVar(), self.getTag())

    def setAttributes(self, lResults, iCurrentLineNumber):
        """Fill the hit from a list of fields.

        @param lResults list of strings, the tag being expected at index 19
        @param iCurrentLineNumber line number, only used in error messages
        @raise CheckerException if the tag field is missing or empty
               (the parent class validates the other fields first)
        """
        VarscanHit_v2_2_8.setAttributes(self, lResults, iCurrentLineNumber)
        # A list that is too short is reported like an empty tag rather than
        # escaping as a bare IndexError.
        if len(lResults) < self.NB_FIELDS or lResults[19] == '':
            raise CheckerException ("The field tag is empty in varscan file in line %s" % iCurrentLineNumber)
        self.setTag(lResults[19])

    def setAttributesFromString(self, varscanString, iCurrentLineNumber ="", fieldSeparator ="\t"):
        """Fill the hit from one raw line of a tagged Varscan file.

        @param varscanString the line; trailing whitespace is stripped
        @param iCurrentLineNumber line number, only used in error messages
        @param fieldSeparator column separator
        @raise CheckerException if the line holds fewer than 20 fields
        """
        varscanString = varscanString.rstrip()
        lvarscanStringItem = varscanString.split(fieldSeparator)
        if len(lvarscanStringItem) < self.NB_FIELDS:
            raise CheckerException ("This varscan line (l.%s) is not complete" % iCurrentLineNumber)
        self.setAttributes(lvarscanStringItem, iCurrentLineNumber)

    def convertVarscanHit_v2_2_8_WithTag_To_VarscanHit_WithTag(self):
        """Return a new VarscanHit_WithTag holding the fields copied below.

        Only chrom, position, ref, var, reads, frequency, strands, qualities,
        p-value and tag are copied; the other attributes of this hit are not.
        """
        iVarscanHit = VarscanHit_WithTag()
        iVarscanHit.setChrom(self.getChrom())
        iVarscanHit.setPosition(self.getPosition())
        iVarscanHit.setRef(self.getRef())
        iVarscanHit.setVar(self.getVar())
        iVarscanHit.setReadsRef(self.getReadsRef())
        iVarscanHit.setReadsVar(self.getReadsVar())
        iVarscanHit.setVarFreq(self.getVarFreq())
        iVarscanHit.setStrandsRef(self.getStrandsRef())
        iVarscanHit.setStrandsVar(self.getStrandsVar())
        iVarscanHit.setQualRef(self.getQualRef())
        iVarscanHit.setQualVar(self.getQualVar())
        iVarscanHit.setPValue(self.getPValue())
        iVarscanHit.setTag(self.getTag())
        return iVarscanHit
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/VarscanToVCF.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/VarscanToVCF.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,152 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import math
import sys
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.parsing.VarscanFile import VarscanFile
from commons.core.seq.Bioseq import Bioseq

LOG_DEPTH = "core.parsing"

##Reference launcher implementation
#
class VarscanToVCF(object):
    """Convert a Varscan 2.2.8 file into a VCF file.

    Each data line of the input becomes one VCF record with the columns
    CHROM, POS, ID, REF, ALT, QUAL, FILTER and INFO. QUAL is computed as
    -10*log10(p-value) and INFO carries AF, DP, RBQ and ABQ.
    """

    def __init__(self, varscanFileName = "", vcfFileName = "", doClean = False, verbosity = 0):
        """
        @param varscanFileName input Varscan file name
        @param vcfFileName output file name; empty means '<input>.vcf'
        @param doClean stored as-is (not used by this class)
        @param verbosity logger verbosity
        """
        self._varscanFileName = varscanFileName
        self.setvcfFileName(vcfFileName)
        self._doClean = doClean
        self._verbosity = verbosity

        self._vcfRevision = "VCFv4.1"
        self._vcfHeader = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"

        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

    def setAttributesFromCmdLine(self):
        """Parse the command line and set the attributes from its options."""
        description = "Conver Varscan file to VCF file."
        epilog = "\t$ python VarscanToVCF.py -i varscanFileName -v 2"
        parser = RepetOptionParser(description = description, epilog = epilog)
        parser.add_option("-i", "--Varscan", dest = "varscanFileName", action = "store", type = "string", help = "input Varscan file name [compulsory] [format: varscan2.2.8]", default = "")
        parser.add_option("-o", "--vcfFileName",dest = "vcfFileName", action = "store", type = "string", help = "vcfFileName file name [default: <input>.vcf]", default = "")
        parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy the parsed options onto the instance."""
        self.setvarscanFileName(options.varscanFileName)
        self.setvcfFileName(options.vcfFileName)
        self.setDoClean(options.doClean)
        self.setVerbosity(options.verbosity)

    def setvarscanFileName(self, varscanFileName):
        """Set the input file name.

        When the output name was derived from the input name (no explicit
        output name given), it is re-derived so that it never stays stuck on
        the name computed from a previous, possibly empty, input name.
        """
        self._varscanFileName = varscanFileName
        if getattr(self, "_isVcfFileNameDerived", False):
            self._vcfFileName = "%s.vcf" % self._varscanFileName

    def setvcfFileName(self, vcfFileName):
        """Set the output file name; an empty name selects '<input>.vcf'."""
        self._isVcfFileNameDerived = (vcfFileName == "")
        if self._isVcfFileNameDerived:
            self._vcfFileName = "%s.vcf" % self._varscanFileName
        else:
            self._vcfFileName = vcfFileName

    def setDoClean(self, doClean):
        self._doClean = doClean

    def setVerbosity(self, verbosity):
        self._verbosity = verbosity

    def _checkOptions(self):
        """Raise (after logging) if the input file name is missing or does not exist."""
        if self._varscanFileName == "":
            self._logAndRaise("ERROR: Missing input file name")
        else:
            if not FileUtils.isRessourceExists(self._varscanFileName):
                self._logAndRaise("ERROR: Input Varscan file '%s' does not exist!" % self._varscanFileName)

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at error level, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    @staticmethod
    def _pValueToQual(pValue):
        """Return -10*log10(pValue) as a float.

        A p-value of exactly 0 is replaced by the smallest positive normalized
        float, so that the conversion yields a large finite quality instead of
        failing with 'math domain error'. Negative p-values still raise
        ValueError.
        """
        p = float(pValue)
        if p == 0.0:
            p = sys.float_info.min
        # Adding 0.0 turns the -0.0 obtained for p == 1 into 0.0, which would
        # otherwise be printed as "-0.000000000".
        return -10 * math.log10(p) + 0.0

    @staticmethod
    def _varFreqToFraction(varFreq):
        """Convert a percentage string such as '12.5%' into the fraction 0.125.

        The '%' sign is optional and a decimal comma is accepted; the previous
        code dropped the last character unconditionally, losing a digit when
        the sign was absent.
        """
        return float(varFreq.strip().rstrip("%").replace(",", ".")) / 100

    def _convertVarscanLineToVCFRecord(self, varscanLine, lineNumber):
        """Convert one Varscan data line into one VCF record line.

        @param varscanLine raw line of the Varscan file
        @param lineNumber 1-based line number, used for parsing errors and warnings
        @return the VCF record, newline included
        """
        iVarscanFile = VarscanFile()
        iVarscanFile.setTypeOfVarscanFile("Varscan_2_2_8")
        iVarscanHit = iVarscanFile.createVarscanObjectFromLine(varscanLine, lineNumber)
        Chrom = iVarscanHit.getChrom()
        Pos = int(iVarscanHit.getPosition())
        ID = "."
        Ref = iVarscanHit.getRef()
        Alt = iVarscanHit.getVar()
        Qual = self._pValueToQual(iVarscanHit.getPValue())
        Filter = "."
        AF = self._varFreqToFraction(iVarscanHit.getVarFreq())
        DP = int(iVarscanHit.getReadsRef()) + int(iVarscanHit.getReadsVar())
        RBQ = iVarscanHit.getQualRef()
        ABQ = iVarscanHit.getQualVar()
        Info = ";".join(["AF=%.4f" %AF,"DP=%d" %DP,"RBQ=%s" %RBQ, "ABQ=%s" %ABQ])

        # The allele computed from the consensus call and the reference wins
        # over the 'VarAllele' column when they disagree.
        # NOTE(review): presumably getATGCNFromIUPACandATGCN resolves the IUPAC
        # consensus code against the reference base - confirm in Bioseq.
        allel = Bioseq().getATGCNFromIUPACandATGCN(iVarscanHit.getCns(), Ref)
        if allel != Alt:
            self._log.warning("'VarAllele' attribute of Varscan file line '%d' was not correct. Correcting using '%s' instead of '%s'." % (lineNumber, allel, Alt))
            Alt = allel

        vcfLine = "%s\t%s\t%s\t%s\t%s\t%.9f\t%s\t%s\n" % (Chrom, Pos, ID, Ref, Alt, Qual, Filter, Info)
        return vcfLine

    def run(self):
        """Check the options, then write the VCF file from the Varscan file.

        Lines starting with '#', the Varscan column header line and blank
        lines are skipped; every other line is converted.
        """
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START Varscan To VCF")
        self._log.debug("Input file name: %s" % self._varscanFileName)

        with open(self._vcfFileName, "w") as fVCF:
            fVCF.write("##fileformat=%s\n" % self._vcfRevision)
            fVCF.write("%s\n" % self._vcfHeader)

            with open(self._varscanFileName, "r") as fVarscan:
                for lineNumber, line in enumerate(fVarscan, 1):
                    if not line.strip():
                        # A blank line (typically at the end of the file)
                        # holds no record and must not abort the conversion.
                        continue
                    if line[0] == "#" or "Chrom\tPosition\tRef\tCons" in line:
                        continue
                    fVCF.write(self._convertVarscanLineToVCFRecord(line, lineNumber))

        self._log.info("END Varscan To VCF")

if __name__ == "__main__":
    iLaunch = VarscanToVCF()
    iLaunch.setAttributesFromCmdLine()
    iLaunch.run()
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/WigParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/WigParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,333 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import re\n+import sys\n+import os.path\n+import struct\n+from commons.core.parsing.TranscriptListParser import TranscriptListParser\n+from SMART.Java.Python.structure.Transcript import Transcript\n+\n+STRANDTOSTR = {1: "(+)", 0: "(=)", None: "(=)", -1: "(-)"}\n+\n+nbOpenHandles = 30\n+\n+\n+class WigParser(TranscriptListParser):\n+\t"""A class that parses a big WIG file, creates an index and make it possible to quickly retrieve some data"""\n+\n+\tdef __init__(self, fileName, verbosity = 1):\n+\t\tself.fileName\t\t = fileName\n+\t\tself.filler\t\t\t = "\\xFF" * struct.calcsize(\'Q\')\n+\t\tself.strands\t\t = False\n+\t\tself.indexFiles\t \t = {}\n+\t\tself.indexBuilt\t\t = False\n+\t\tself.defaultValue\t = 0.0\n+\t\tself.currentChromosome = None\n+\t\tself.currentStrand\t = 1\n+\t\tself.verbosity = verbosity\n+\t\tsuper(WigParser, self).__init__(fileName, verbosity)\n+\n+\n+\tdef __def__(self):\n+\t\tfor file in self.indexFiles.values():\n+\t\t\tfile.close()\n+\n+\n+\tdef setStrands(self, strands):\n+\t\tself.strands = strands\n+\n+\n+\tdef setDefaultValue(self, value):\n+\t\tself.defaultValue = value\n+\n+\n+\tdef getFileFormats():\n+\t\treturn ["wig"]\n+\tgetFileFormats = staticmethod(getFileFormats)\n+\n+\n+\tdef setStrands(self, strands):\n+\t\t"""\n+\t\tConsider both strands separately\n+\t\t"""\n+\t\tself.strands = strands\n+\n+\n+\tdef makeIndexName(self, chromosome, strand = None):\n+\t\t"""\n+\t\tCreate an index name for a file\n+\t\t"""\n+\t\tdirectoryName = os.path.dirname(self.fileName)\n+\t\tif strand == 
None:\n+\t\t\tstrandName = ""\n+\t\telse:\n+\t\t\tstrandName = "+" if strand == 1 else "-"\n+\t\tindexName = os.path.join(directoryName, ".%s%s.index" % (chromosome, strandName))\n+\t\treturn indexName\n+\t\n+\t\n+\tdef findIndexFile(self, chromosome, strand = None):\n+\t\t"""\n+\t\tCheck if the index of a file exists\n+\t\t""" \n+\t\tindexName = self.makeIndexName(chromosome, strand)\n+\t\tif os.path.exists(indexName):\n+\t\t\treturn indexName\n+\t\treturn False\n+\t\n+\t\n+\tdef makeIndexFile(self):\n+\t\t"""\n+\t\tCreate the index for a file\n+\t\t"""\n+\t\tif self.indexBuilt:\n+\t\t\treturn\n+\n+\t\tinputFile = open(self.fileName)\n+\t\toutputFile = None\n+\t\tindex\t = 0\n+\t\tmark\t = inputFile.tell()\n+\t\tline\t = inputFile.readline().strip()\n+\t\tchromosome = None\n+\n+\t\twhile line != "":\n+\t\t\tm1 = re.search(r"^\\s*-?\\d+\\.?\\d*\\s*$", line)\n+\t\t\tm2 = re.search(r"^\\s*(\\d+)\\s+-?\\d+\\.?\\d*\\s*$", line)\n+\t\t\tm3 = re.search(r"^\\s*fixedStep\\s+chrom=(\\S+)\\s+start=(\\d+)\\s+step=1\\s*$", line)\n+\t\t\tm4 = re.search(r"^\\s*fixedStep\\s+chrom=\\S+\\s+start=\\d+\\s+step=\\d+\\s+span=\\d+\\s*$", line)\n+\t\t\tm5 = re.search(r"^\\s*variable'..b'ndex for chromosome %s, strand %s does not exist." 
% (chromosome, STRANDTOSTR[strand])\n+\t\t\treturn False\n+\t\tindexFile = open(indexFileName, "rb")\n+\n+\t\tif len(self.indexFiles.keys()) > nbOpenHandles:\n+\t\t\tremovedKey = set(self.indexFiles.keys()).pop()\n+\t\t\tself.indexFiles[removedKey].close()\n+\t\t\tdel self.indexFiles[removedKey]\n+\t\tself.indexFiles[indexFileKey] = indexFile\n+\t\treturn indexFile\n+\t\t\n+\n+\t\n+\tdef findIndex(self, chromosome, start, strand = None):\n+\t\t"""\n+\t\tFind the point where to start reading file\n+\t\t"""\n+\n+\t\tsizeOfLong = struct.calcsize("Q")\n+\t\tempty\t = int(struct.unpack("Q", self.filler)[0])\n+\t\toffset\t = empty\n+\t\tindexFile = self.getIndexFileHandle(chromosome, strand)\n+\t\n+\t\tif not indexFile:\n+\t\t\treturn (None, None)\n+\t\t\n+\t\twhile offset == empty:\n+\t\t\taddress = start * sizeOfLong\n+\t\t\tindexFile.seek(address, os.SEEK_SET)\n+\t\t\t\n+\t\t\tbuffer = indexFile.read(sizeOfLong)\n+\t\t\tif len(buffer) != sizeOfLong:\n+\t\t\t\tif buffer == "":\n+\t\t\t\t\tprint "Warning! Index position %d of chromosome %s on strand %s seems out of range!" % (start, chromosome, STRANDTOSTR[strand])\n+\t\t\t\t\treturn (None, None)\n+\t\t\t\telse:\n+\t\t\t\t\traise Exception("Problem fetching position %d of chromosome %s on strand %s seems out of range!" 
% (start, chromosome, STRANDTOSTR[strand]))\n+\t\t\t\n+\t\t\toffset = int(struct.unpack("Q", buffer)[0])\n+\t\t\tstart += 1\n+\t\t\t\n+\t\tstart -= 1\n+\t\treturn (offset, start)\n+\t\n+\t\n+\n+\tdef getRange(self, chromosome, start, end):\n+\t\t"""\n+\t\tParse a wig file and output a range\n+\t\t"""\n+\t\tarrays = {}\n+\t\tstrands = {1: "+", -1: "-"} if self.strands else {0: ""}\n+\n+\t\tfor strand in strands:\n+\n+\t\t\tarray = [self.defaultValue] * (end - start + 1)\n+\t\t\tfile = open(self.fileName)\n+\t\t\toffset, index = self.findIndex(chromosome, start, strand if self.strands else None)\n+\t\t\tif offset == None:\n+\t\t\t\tarrays[strand] = array\n+\t\t\t\tcontinue\n+\t\t\tfile.seek(offset, os.SEEK_SET)\n+\n+\t\t\tfor line in file:\n+\t\t\t\tline = line.strip()\n+\n+\t\t\t\tm1 = re.search(r"^\\s*(-?\\d+\\.?\\d*)\\s*$", line)\n+\t\t\t\tm2 = re.search(r"^\\s*(\\d+)\\s+(-?\\d+\\.?\\d*)\\s*$", line)\n+\t\t\t\tm3 = re.search(r"^\\s*fixedStep\\s+chrom=(\\S+)\\s+start=(\\d+)\\s+step=\\d+\\s*$", line)\n+\t\t\t\tm4 = re.search(r"^\\s*variableStep\\s+chrom=(\\S+)\\s*$", line)\n+\n+\t\t\t\tif m1 != None:\n+\t\t\t\t\tif index > end:\n+\t\t\t\t\t\tbreak\n+\t\t\t\t\tif index >= start:\n+\t\t\t\t\t\tarray[index - start] = float(m1.group(1))\n+\t\t\t\t\tindex += 1\n+\t\t\t\telif m2 != None:\n+\t\t\t\t\tindex = int(m2.group(1))\n+\t\t\t\t\tif index > end:\n+\t\t\t\t\t\tbreak\n+\t\t\t\t\tif index >= start:\n+\t\t\t\t\t\tarray[index - start] = float(m2.group(2))\n+\t\t\t\t\tindex += 1\n+\t\t\t\telif m3 != None:\n+\t\t\t\t\tif m3.group(1) != "%s%s" % (chromosome, strands[strand]):\n+\t\t\t\t\t\tbreak\n+\t\t\t\t\tindex = int(m3.group(2))\n+\t\t\t\telif m4 != None:\n+\t\t\t\t\tif m4.group(1) != "%s%s" % (chromosome, strands[strand]):\n+\t\t\t\t\t\tbreak\n+\t\t\t\telif (len(line) == 0) or (line[0] == "#") or line.startswith("track"):\n+\t\t\t\t\tpass\n+\t\t\t\telse:\n+\t\t\t\t\traise Exception("Error! 
Cannot read line \'%s\' of wig file" % (line))\n+\n+\t\t\tfile.close()\n+\t\n+\t\t\tarrays[strand] = array\n+\t\t\t\n+\t\tif self.strands:\n+\t\t\treturn arrays\n+\t\treturn array\n+\t\n+\n+\tdef skipFirstLines(self):\n+\t\treturn\n+\n+\t\n+\tdef parseLine(self, line):\n+\t\tif line.startswith("track"):\n+\t\t\treturn None\n+\t\tm = re.search(r"^\\s*variableStep\\s+chrom=(\\S+)", line)\n+\t\tif m != None:\n+\t\t\tchromosome = m.group(1)\n+\t\t\tif chromosome.endswith("+"):\n+\t\t\t\tself.currentStrand = 1\n+\t\t\t\tself.currentChromosome = chromosome[:-1]\n+\t\t\telif chromosome.endswith("-"):\n+\t\t\t\tself.currentStrand = -1\n+\t\t\t\tself.currentChromosome = chromosome[:-1]\n+\t\t\telse:\n+\t\t\t\tself.currentStrand = 1\n+\t\t\t\tself.currentChromosome = chromosome\n+\t\t\treturn None\n+\t\tposition, value = line.split()\n+\t\tposition = int(position)\n+\t\tvalue\t= float(value)\n+\t\ttranscript = Transcript()\n+\t\ttranscript.setChromosome(self.currentChromosome)\n+\t\ttranscript.setStart(position)\n+\t\ttranscript.setEnd(position)\n+\t\ttranscript.setDirection(self.currentStrand)\n+\t\ttranscript.setTagValue("ID", "wig_%s_%d_%d" % (self.currentChromosome, self.currentStrand, position))\n+\t\ttranscript.setTagValue("nbElements", value)\n+\t\treturn transcript\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/parsing/multifastaParserLauncher.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/multifastaParserLauncher.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
#!/usr/bin/env python

"""
Launcher for the multifasta parser.
@param b: Name of the batch of sequences (one batch mode)
@param g: Name of the gene (one batch mode)
@param t: Scientific name of the taxon concerned
@param f: Name of the multifasta input file (one batch mode)
@param d: Name of the directory containing the multifasta input file(s) (multi-batch mode)
"""


import os
import sys
import getopt
from commons.core.parsing.Multifasta2SNPFile import Multifasta2SNPFile

CURRENT_DIR = os.getcwd()

def help():

    """
    Give the list of the command-line options.
    """

    # Every message is printed as a single string inside parentheses: this
    # form produces the same output with the Python 2 print statement and
    # with the Python 3 print function.
    print(" ".join(["Usage: ", sys.argv[0], "[ options ]"]))
    print(" -h: this help")
    print("Mandatory option:")
    print(" -t: Scientific name of the taxon concerned")
    print("Exclusive options (use either the first or the second, one should be used)")
    print(" -f: Name of the multifasta input file in one batch mode")
    print(" -d: Name of the directory containing multifasta input file(s) in multi-batch mode")
    print("Only in one batch mode: mandatory options (when -f is used):")
    print(" -b: Name of the batch of submitted sequences")
    print(" -g: Name of the gene")
    print("")


def runOneInputFile(batchName, geneName, taxon, inputFileName):
    """
    Run the multifasta parser on a single input file (one batch mode).

    @param batchName: Name of the batch of submitted sequences
    @param geneName: Name of the gene
    @param taxon: Scientific name of the taxon concerned
    @param inputFileName: Name of the multifasta input file
    """
    print("Multifasta parseur launched:!\n")
    print("-- Input File: " + inputFileName + "\n")
    print("-- Batch name: " + batchName + "\n")
    print("-- Gene name: " + geneName + "\n")
    print("-- Taxon: " + taxon + "\n")
    #TODO: handle the deletion of the files (append mode)
    multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, geneName)
    multifasta2SNPFile.runOneBatch(inputFileName)
    print("OK: Files generated!")


def runSeveralInputFile(taxon, rootDirectoryName):
    """
    Run the multifasta parser on a directory of input files (multi-batch mode).

    @param taxon: Scientific name of the taxon concerned
    @param rootDirectoryName: Name of the directory containing the multifasta input file(s)
    """
    multifasta2SNPFile = Multifasta2SNPFile(taxon)
    multifasta2SNPFile.runSeveralBatches(rootDirectoryName)

def main():
    """
    Parse the command line, check the options and launch the parser.

    Exits with status 2 on an unknown option, 1 on a missing or inconsistent
    option and 0 after printing the help (-h).

    @return: 0 when the run is completed
    """
    batchName = ""
    geneName = ""
    taxon = ""
    inputFileName = ""
    rootDirectoryName = ""

    # Messages are printed as a single parenthesized string: same output with
    # the Python 2 print statement and the Python 3 print function.
    try:
        opts = getopt.getopt(sys.argv[1:],"hb:g:t:f:d:")[0]
    except getopt.GetoptError:
        print("Invalid options\n")
        help()
        sys.exit(2)

    for o, a in opts:
        if o == "-h":
            help()
            # sys.exit rather than the 'exit' builtin, which is only injected
            # by the 'site' module and is missing e.g. under 'python -S'.
            sys.exit(0)
        elif o == "-b":
            batchName = a
        elif o == "-g":
            geneName = a
        elif o == "-t":
            taxon = a
        elif o == "-f":
            inputFileName = a
        elif o == "-d":
            rootDirectoryName = os.path.abspath(a)

    if taxon == "":
        print("*** Error: The mandatory option -t is missing")
        help()
        sys.exit(1)

    oneFileMode = (inputFileName != "")
    directoryMode = (rootDirectoryName != "")

    # Exactly one of the two input modes must be selected.
    if oneFileMode == directoryMode:
        print("*** Error: You have to specify the input mode: choose either -f (for one file) or -d (for one directory of several files)")
        help()
        sys.exit(1)

    if oneFileMode:
        if batchName == "" or geneName == "":
            print("*** Error: A mandatory option is missing in one batch mode (-b or -g)")
            help()
            sys.exit(1)
        runOneInputFile(batchName, geneName, taxon, inputFileName)
    else:
        runSeveralInputFile(taxon, rootDirectoryName)

    return 0

#------------------------------------------------------------------------------
if __name__ == "__main__":
    sys.exit(main())
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/seq/AlignedBioseqDB.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/AlignedBioseqDB.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,440 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import sys\n+from commons.core.seq.BioseqDB import BioseqDB\n+from commons.core.seq.Bioseq import Bioseq\n+from commons.core.coord.Align import Align\n+from commons.core.coord.Range import Range\n+from commons.core.stat.Stat import Stat\n+from math import log\n+\n+\n+## Multiple Sequence Alignment Representation \n+# \n+#\n+class AlignedBioseqDB( BioseqDB ):\n+ \n+ def __init__( self, name="" ):\n+ BioseqDB.__init__( self, name )\n+ seqLength = self.getLength()\n+ if self.getSize() > 1:\n+ for bs in self.db[1:]:\n+ if bs.getLength() != seqLength:\n+ print "ERROR: aligned sequences have different length"\n+ \n+ \n+ ## Get length of the alignment\n+ # \n+ # @return length\n+ # @warning name before migration was \'length\'\n+ #\n+ def getLength( self ):\n+ length = 0\n+ if self.db != []:\n+ length = self.db[0].getLength()\n+ return length\n+ \n+ \n+ ## Get the true length of a given sequence (without gaps)\n+ #\n+ # @param header string header of the sequence to analyze\n+ # @return length integer\n+ # @warning name before migration was \'true_length\'\n+ #\n+ def getSeqLengthWithoutGaps( self, header ):\n+ bs = self.fetch( header )\n+ count = 0\n+ for pos in xrange(0,len(bs.sequence)):\n+ if bs.sequence[pos] != "-":\n+ count += 1\n+ return count\n+ \n+ def cleanMSA( self ):\n+ #TODO: Refactoring\n+ """clean the MSA"""\n+ i2del = []\n+\n+ # for each sequence in the MSA\n+ for seqi in xrange(0,self.getSize()):\n+ if seqi in i2del:\n+ continue\n+ #define it as the reference\n+ ref = self.db[seqi].sequence\n+ refHeader = self.db[seqi].header\n+ # for each following sequence\n+ for seq_next in xrange(seqi+1,self.getSize()):\n+ if seq_next in i2del:\n+ continue\n+ keep = 0\n+ # for each position along the MSA\n+ for posx in xrange(0,self.getLength()):\n+ seq = self.db[seq_next].sequence\n+ if seq[posx] != \'-\' and 
ref[posx] != \'-\':\n+ keep = 1\n+ break\n+ seqHeader = self.db[s'..b'urn 0.0\n+ else:\n+ freq = nbOcc / float(nbNt)\n+ return - freq * log(freq) / log(2) \n+ \n+ \n+ ## Save the multiple alignment as a matrix with \'0\' if gap, \'1\' otherwise\n+ #\n+ def saveAsBinaryMatrix( self, outFile ):\n+ outFileHandler = open( outFile, "w" )\n+ for bs in self.db:\n+ string = "%s" % ( bs.header )\n+ for nt in bs.sequence:\n+ if nt != "-":\n+ string += "\\t%i" % ( 1 )\n+ else:\n+ string += "\\t%i" % ( 0 )\n+ outFileHandler.write( "%s\\n" % ( string ) )\n+ outFileHandler.close()\n+ \n+ \n+ ## Return a list of Align instances corresponding to the aligned regions (without gaps)\n+ #\n+ # @param query string header of the sequence considered as query\n+ # @param subject string header of the sequence considered as subject\n+ #\n+ def getAlignList( self, query, subject ):\n+ lAligns = []\n+ alignQ = self.fetch( query ).sequence\n+ alignS = self.fetch( subject ).sequence\n+ createNewAlign = True\n+ indexAlign = 0\n+ indexQ = 0\n+ indexS = 0\n+ while indexAlign < len(alignQ):\n+ if alignQ[ indexAlign ] != "-" and alignS[ indexAlign ] != "-":\n+ indexQ += 1\n+ indexS += 1\n+ if createNewAlign:\n+ iAlign = Align( Range( query, indexQ, indexQ ),\n+ Range( subject, indexS, indexS ),\n+ 0,\n+ int( alignQ[ indexAlign ] == alignS[ indexAlign ] ),\n+ int( alignQ[ indexAlign ] == alignS[ indexAlign ] ) )\n+ lAligns.append( iAlign )\n+ createNewAlign = False\n+ else:\n+ lAligns[-1].range_query.end += 1\n+ lAligns[-1].range_subject.end += 1\n+ lAligns[-1].score += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )\n+ lAligns[-1].identity += int( alignQ[ indexAlign ] == alignS[ indexAlign ] )\n+ else:\n+ if not createNewAlign:\n+ lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()\n+ createNewAlign = True\n+ if alignQ[ indexAlign ] != "-":\n+ indexQ += 1\n+ elif alignS[ indexAlign ] != "-":\n+ indexS += 1\n+ indexAlign += 1\n+ if not createNewAlign:\n+ 
lAligns[-1].identity = 100 * lAligns[-1].identity / lAligns[-1].getLengthOnQuery()\n+ return lAligns\n+ \n+ \n+ def removeGaps(self):\n+ for iBs in self.db:\n+ iBs.removeSymbol( "-" )\n+ \n+ ## Compute mean per cent identity for MSA. \n+ # First sequence in MSA is considered as reference sequence. \n+ #\n+ # \n+ def computeMeanPcentIdentity(self):\n+ seqRef = self.db[0]\n+ sumPcentIdentity = 0\n+\n+ for seq in self.db[1:]:\n+ pcentIdentity = self._computePcentIdentityBetweenSeqRefAndCurrentSeq(seqRef, seq) \n+ sumPcentIdentity = sumPcentIdentity + pcentIdentity\n+ \n+ nbSeq = len(self.db[1:])\n+ meanPcentIdentity = round (sumPcentIdentity/nbSeq)\n+ \n+ return meanPcentIdentity\n+\n+ def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):\n+ indexOnSeqRef = 0\n+ sumIdentity = 0\n+ for nuclSeq in seq.sequence:\n+ nuclRef = seqRef.sequence[indexOnSeqRef]\n+ \n+ if nuclRef != "-" and nuclRef == nuclSeq:\n+ sumIdentity = sumIdentity + 1\n+ indexOnSeqRef = indexOnSeqRef + 1 \n+ \n+ return float(sumIdentity) / float(seqRef.getLength()) * 100 \n+\n+ \n+\n+\n+ \n+ \n+ \n+ \n+ \n+ \n+ \n+ \n+ \n+ \n+\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/seq/Bioseq.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/Bioseq.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,735 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import sys\n+import string\n+import re\n+import random\n+import cStringIO\n+from commons.core.coord.Map import Map\n+from commons.core.checker.RepetException import RepetException\n+\n+DNA_ALPHABET_WITH_N = set( [\'A\',\'T\',\'G\',\'C\',\'N\'] )\n+IUPAC = set([\'A\',\'T\',\'G\',\'C\',\'U\',\'R\',\'Y\',\'M\',\'K\',\'W\',\'S\',\'B\',\'D\',\'H\',\'V\',\'N\'])\n+\n+\n+## Record a sequence with its header\n+#\n+class Bioseq( object ):\n+ \n+ header = ""\n+ sequence = ""\n+ \n+ ## constructor\n+ #\n+ # @param name the header of sequence\n+ # @param seq sequence (DNA, RNA, protein)\n+ #\n+ def __init__( self, name="", seq="" ):\n+ self.header = name\n+ self.sequence = seq\n+ \n+ \n+ ## Equal operator\n+ # \n+ def __eq__( self, o ):\n+ if self.header==o.header and self.sequence==o.sequence:\n+ return True\n+ return False\n+ \n+ \n+ ## overload __repr__\n+ #\n+ def __repr__( self ):\n+ return "%s;%s" % ( self.header, self.sequence )\n+ \n+ \n+ ## set attribute header\n+ #\n+ # @param header a string\n+ #\n+ def setHeader( self, header ):\n+ self.header = header\n+ \n+ \n+ ## get attribute header\n+ #\n+ # @return header\n+ def getHeader(self):\n+ return self.header\n+ \n+ \n+ ## set attribute sequence\n+ #\n+ # @param sequence a string\n+ #\n+ def setSequence( self, sequence ):\n+ self.sequence = sequence\n+ \n+ \n+ def getSequence(self):\n+ return self.sequence\n+ \n+ ## reset\n+ #\n+ def reset( self ):\n+ self.setHeader( "" )\n+ self.setSequence( "" )\n+ \n+ \n+ ## Test if bioseq is empty\n+ #\n+ def isEmpty( self ):\n+ return self.header == "" and self.sequence == ""\n+ \n+ \n+ ## Reverse the sequence\n+ #\n+ def reverse( self ):\n+ tmp = self.sequence\n+ self.sequence = tmp[::-1]\n+ \n+ \n+ ## Turn the sequence into its complement\n+ # Force upper case letters\n+ # @warning: old name in pyRepet.Bioseq realComplement\n+ 
#\n+ def complement( self ):\n+ complement = ""\n+ self.upCase()\n+ for i in xrange(0,len(self.sequence),1):\n+ if self.sequence[i] == "A":\n+ complement += "T"\n+ elif self.sequence[i] == "T":\n+ complement += "A"\n+ elif self.s'..b'etLMapWhithoutGap( self ):\n+ lMaps = []\n+ countSite = 1\n+ countSubseq = 1\n+ inGap = False\n+ startMap = -1\n+ endMap = -1\n+\n+ # initialize with the first site\n+ if self.sequence[0] == "-":\n+ inGap = True\n+ else:\n+ startMap = countSite\n+\n+ # for each remaining site\n+ for site in self.sequence[1:]:\n+ countSite += 1\n+\n+ # if it is a gap\n+ if site == "-":\n+\n+ # if this is the beginning of a gap, record the previous subsequence\n+ if inGap == False:\n+ inGap = True\n+ endMap = countSite - 1\n+ lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) )\n+ countSubseq += 1\n+\n+ # if it is NOT a gap\n+ if site != "-":\n+\n+ # if it is the end of a gap, begin the next subsequence\n+ if inGap == True:\n+ inGap = False\n+ startMap = countSite\n+\n+ # if it is the last site\n+ if countSite == self.getLength():\n+ endMap = countSite\n+ lMaps.append( Map( "%s_subSeq%i" % (self.header,countSubseq), self.header, startMap, endMap ) )\n+\n+ return lMaps\n+ \n+ \n+ ## get the percentage of GC\n+ #\n+ # @return a percentage\n+ # \n+ def getGCpercentage( self ):\n+ tmpSeq = self.getSeqWithOnlyATGCN()\n+ nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )\n+ return 100 * nbGC / float( self.getLength() )\n+ \n+ ## get the percentage of GC of a sequence without counting N in sequence length\n+ #\n+ # @return a percentage\n+ # \n+ def getGCpercentageInSequenceWithoutCountNInLength(self):\n+ tmpSeq = self.getSeqWithOnlyATGCN()\n+ nbGC = tmpSeq.count( "G" ) + tmpSeq.count( "C" )\n+ return 100 * nbGC / float( self.getLength() - self.countNt("N") )\n+ \n+ ## get the 5 prime subsequence of a given length at the given position \n+ #\n+ # @param position integer\n+ # @param flankLength integer subsequence 
length\n+ # @return a sequence string\n+ # \n+ def get5PrimeFlank(self, position, flankLength):\n+ if(position == 1):\n+ return ""\n+ else:\n+ startOfFlank = 1\n+ endOfFlank = position -1\n+ \n+ if((position - flankLength) > 0):\n+ startOfFlank = position - flankLength\n+ else:\n+ startOfFlank = 1\n+ \n+ return self.subseq(startOfFlank, endOfFlank).sequence\n+ \n+ \n+ ## get the 3 prime subsequence of a given length at the given position \n+ # In the case of indels, the polymorphism length can be specified\n+ #\n+ # @param position integer\n+ # @param flankLength integer subsequence length\n+ # @param polymLength integer polymorphism length\n+ # @return a sequence string\n+ # \n+ def get3PrimeFlank(self, position, flankLength, polymLength = 1):\n+ if((position + polymLength) > len( self.sequence )):\n+ return ""\n+ else:\n+ startOfFlank = position + polymLength\n+ \n+ if((position+polymLength+flankLength) > len( self.sequence )):\n+ endOfFlank = len( self.sequence )\n+ else:\n+ endOfFlank = position+polymLength+flankLength-1\n+ \n+ return self.subseq(startOfFlank, endOfFlank).sequence\n+ \n+ \n+ def _createWordList(self,size,l=[\'A\',\'T\',\'G\',\'C\']):\n+ if size == 1 :\n+ return l\n+ else:\n+ l2 = []\n+ for i in l:\n+ for j in [\'A\',\'T\',\'G\',\'C\']:\n+ l2.append( i + j )\n+ return self._createWordList(size-1,l2)\n+ \n+ \n+ def removeSymbol( self, symbol ):\n+ tmp = self.sequence.replace( symbol, "" )\n+ self.sequence = tmp\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/seq/BioseqDB.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/BioseqDB.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,461 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import sys\n+import re\n+from commons.core.seq.Bioseq import Bioseq\n+from commons.core.stat.Stat import Stat\n+\n+\n+## Handle a collection of a Bioseq (header-sequence) \n+#\n+class BioseqDB( object ):\n+ \n+ def __init__( self, name="" ):\n+ self.idx = {}\n+ self.idx_renamed = {}\n+ self.db = []\n+ self.name = name\n+ if name != "":\n+ faFile = open( name )\n+ self.read( faFile )\n+ faFile.close()\n+ self.mean_seq_lgth = None\n+ self.stat = Stat()\n+ \n+ \n+ ## Equal operator\n+ #\n+ def __eq__( self, o ):\n+ selfSize = self.getSize()\n+ if selfSize != o.getSize():\n+ return False\n+ nbEqualInstances = 0\n+ for i in self.db:\n+ atLeastOneIsEqual = False\n+ for j in o.db:\n+ if i == j:\n+ atLeastOneIsEqual = True\n+ continue\n+ if atLeastOneIsEqual:\n+ nbEqualInstances += 1\n+ if nbEqualInstances == selfSize:\n+ return True\n+ return False\n+ \n+ \n+ ## Change the name of the BioseqDB\n+ #\n+ # @param name the BioseqDB name\n+ # \n+ def setName(self, name):\n+ self.name = name\n+ \n+ \n+ ## Record each sequence of the input file as a list of Bioseq instances\n+ #\n+ # @param faFileHandler handler of a fasta file\n+ #\n+ def read( self, faFileHandler ):\n+ while True:\n+ seq = Bioseq()\n+ seq.read( faFileHandler )\n+ if seq.sequence == None:\n+ break\n+ self.add( seq )\n+ \n+ \n+ ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long)\n+ #\n+ # @param faFileHandler file handler of a fasta file\n+ #\n+ def write( self, faFileHandler ):\n+ for bs in self.db:\n+ bs.writeABioseqInAFastaFile( faFileHandler )\n+ \n+ \n+ ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long)\n+ #\n+ # @param outFaFileName file name of fasta file\n+ # @param mode \'write\' or \'append\'\n+ #\n+ def save( self, outFaFileName, mode="w" ):\n+ outFaFile = open( outFaFileName, mode )\n+ self.write( 
outFaFile )\n+ outFaFile.close()\n+ \n+ \n+ ## Read a formatted fasta file and l'..b'on of wished Bioseq header\n+ # @param inFileName name of fasta file in which we want extract the BioseqDB\n+ #\n+ def extractPatternOfFile(self, pattern, inFileName):\n+ if pattern=="" :\n+ return\n+ srch=re.compile(pattern)\n+ file_db=open(inFileName)\n+ numseq=0\n+ nbsave=0\n+ while 1:\n+ seq=Bioseq()\n+ seq.read(file_db)\n+ if seq.sequence==None:\n+ break\n+ numseq+=1\n+ m=srch.search(seq.header)\n+ if m:\n+ self.add(seq)\n+ nbsave+=1\n+ file_db.close()\n+ \n+ \n+ ## Extract a sub BioseqDB from the instance with all Bioseq header containing the specified pattern\n+ #\n+ # @param pattern regular expression of wished Bioseq header\n+ #\n+ # @return a BioseqDB\n+ #\n+ def getByPattern(self,pattern):\n+ if pattern=="" :\n+ return\n+ iBioseqDB=BioseqDB()\n+ srch=re.compile(pattern)\n+ for iBioseq in self.db:\n+ if srch.search(iBioseq.header):\n+ iBioseqDB.add(iBioseq)\n+ return iBioseqDB\n+ \n+ \n+ ## Extract a sub BioseqDB from the instance with all Bioseq header not containing the specified pattern\n+ #\n+ # @param pattern regular expression of not wished Bioseq header\n+ #\n+ # @return a BioseqDB\n+ #\n+ def getDiffFromPattern(self,pattern):\n+ if pattern=="" :\n+ return\n+ iBioseqDB=BioseqDB()\n+ srch=re.compile(pattern)\n+ for iBioseq in self.db:\n+ if not srch.search(iBioseq.header):\n+ iBioseqDB.add(iBioseq)\n+ return iBioseqDB\n+ \n+ #TODO: to run several times to remove all concerned sequences when big data. 
How to fix it ?\n+ ## Remove from the instance all Bioseq which header contains the specified pattern\n+ #\n+ # @param pattern regular expression of not wished Bioseq header\n+ #\n+ def rmByPattern(self,pattern):\n+ if pattern=="" :\n+ return\n+ srch=re.compile(pattern)\n+ for seq in self.db:\n+ if srch.search(seq.header):\n+ self.db.remove(seq) \n+ \n+ \n+ ## Copy a part from another BioseqDB in the BioseqDB if Bioseq have got header containing the specified pattern\n+ # \n+ # @warning this method is called extractPattern in pyRepet.seq.BioseqDB\n+ #\n+ # @param pattern regular expression of wished Bioseq header\n+ # @param sourceBioseqDB the BioseqDB from which we want extract Bioseq\n+ #\n+ def addBioseqFromABioseqDBIfHeaderContainPattern(self, pattern, sourceBioseqDB):\n+ if pattern=="" :\n+ return\n+ srch=re.compile(pattern)\n+ for seq in sourceBioseqDB.db:\n+ m=srch.search(seq.header)\n+ if m:\n+ self.add(seq) \n+ \n+ \n+ ## Up-case the sequence characters in all sequences\n+ # \n+ def upCase( self ):\n+ for bs in self.db:\n+ bs.upCase()\n+ \n+ \n+ ## Split each gapped Bioseq in a list and store all in a dictionary\n+ #\n+ # @return a dict, keys are bioseq headers, values are list of Map instances \n+ #\n+ def getDictOfLMapsWithoutGaps( self ):\n+ dSeq2Maps = {}\n+\n+ for bs in self.db:\n+ dSeq2Maps[ bs.header ] = bs.getLMapWhithoutGap()\n+\n+ return dSeq2Maps\n+\n+ ## Give the list of the sequence length in the bank\n+ #\n+ # @return an list\n+ #\n+ def getListOfSequencesLength( self ):\n+ lLength = []\n+ for iBioseq in self.db:\n+ lLength.append(iBioseq.getLength())\n+\n+ return lLength\n+ \n+ ## Return sequence length for a list of sequence header\n+ #\n+ def getSeqLengthByListOfName( self, lHeaderName ):\n+ lseqLength=[]\n+ for headerName in lHeaderName: \n+ lseqLength.append(self.getSeqLength( headerName ))\n+ return lseqLength\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/seq/BioseqUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/BioseqUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,296 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import math\n+import re\n+from commons.core.seq.Bioseq import Bioseq\n+\n+## Static methods for sequences manipulation\n+#\n+class BioseqUtils(object):\n+ \n+ ## Translate a nucleotide sequence\n+ #\n+ # @param bioSeqInstanceToTranslate a bioseq instance to translate\n+ # @param phase a integer : 1 (default), 2 or 3\n+ # \n+ def translateSequence(bioSeqInstanceToTranslate, phase=1):\n+ pep = ""\n+ #length = math.floor((len(self.sequence)-phase-1)/3)*3\n+ length = int( math.floor( ( len(bioSeqInstanceToTranslate.sequence )-( phase-1 ) )/3 )*3 )\n+ #We need capital letters !\n+ bioSeqInstanceToTranslate.upCase() \n+ sequence = bioSeqInstanceToTranslate.sequence \n+ for i in xrange(phase-1,length,3):\n+ if (sequence[i:i+3] == "TTT" or sequence[i:i+3] == "TTC"):\n+ pep = pep + "F"\n+ elif ( sequence[i:i+3] == "TTA" or sequence[i:i+3] == "TTG" ):\n+ pep = pep + "L"\n+ elif ( sequence[i:i+2] == "CT" ):\n+ pep = pep + "L"\n+ elif ( sequence[i:i+3] == "ATT" or sequence[i:i+3] == "ATC" or sequence[i:i+3] == "ATA" ):\n+ pep = pep + "I"\n+ elif ( sequence[i:i+3] == "ATG" ):\n+ pep = pep + "M"\n+ elif ( sequence[i:i+2] == "GT" ):\n+ pep = pep + "V"\n+ elif ( sequence[i:i+2] == "TC" ) :\n+ pep = pep + "S"\n+ elif ( sequence[i:i+2] == "CC" ) :\n+ pep = pep + "P"\n+ elif ( sequence[i:i+2] == "AC" ) :\n+ pep = pep + "T"\n+ elif ( sequence[i:i+2] == "GC" ) :\n+ pep = pep + "A"\n+ elif ( sequence[i:i+3] == "TAT" or sequence[i:i+3] == "TAC" ) :\n+ pep = pep + "Y"\n+ elif ( sequence[i:i+3] == "TAA" or sequence[i:i+3] == "TAG" ) :\n+ pep = pep + "*"\n+ elif ( sequence[i:i+3] == "CAT" or sequence[i:i+3] == "CAC" ) :\n+ pep = pep + "H"\n+ elif ( sequence[i:i+3] == "CAA" or sequence[i:i+3] == "CAG" ) :\n+ pep = pep + "Q"\n+ elif ( sequence[i:i+3] == "AAT" or sequence[i:i+3] == "AAC" ) :\n+ pep = pep + "N"\n+ elif ( sequence[i:i+3] == 
"AAA" or sequence[i:i+3] == "AAG" ) :\n+ pep = pep + "K"\n+ elif ( se'..b'\n+ writeBioseqListIntoFastaFile = staticmethod( writeBioseqListIntoFastaFile )\n+ \n+ ## read in a fasta file and create a list of bioseq instances\n+ #\n+ # @param fileName string\n+ # @return a list of bioseq\n+ #\n+ def extractBioseqListFromFastaFile( fileName ):\n+ file = open( fileName )\n+ lBioseq = []\n+ currentHeader = ""\n+ while currentHeader != None:\n+ bioseq = Bioseq()\n+ bioseq.read(file)\n+ currentHeader = bioseq.header\n+ if currentHeader != None:\n+ lBioseq.append(bioseq)\n+ return lBioseq\n+ \n+ extractBioseqListFromFastaFile = staticmethod( extractBioseqListFromFastaFile )\n+ \n+ ## Give the length of a sequence search by name\n+ #\n+ # @param lBioseq a list of bioseq instances\n+ # @param seqName string\n+ # @return an integer\n+ #\n+ def getSeqLengthWithSeqName( lBioseq, seqName ):\n+ length = 0\n+ for bioseq in lBioseq:\n+ if bioseq.header == seqName:\n+ length = bioseq.getLength()\n+ break \n+ return length\n+\n+ getSeqLengthWithSeqName = staticmethod( getSeqLengthWithSeqName )\n+\n+ def _translateInPositiveFrames( bioSeqInstanceToTranslate ):\n+ seq1 = bioSeqInstanceToTranslate.copyBioseqInstance()\n+ BioseqUtils.setFrameInfoOnHeader(seq1, 1)\n+ BioseqUtils.translateSequence(seq1, 1)\n+ seq2 = bioSeqInstanceToTranslate.copyBioseqInstance()\n+ BioseqUtils.setFrameInfoOnHeader(seq2, 2)\n+ BioseqUtils.translateSequence(seq2, 2)\n+ seq3 = bioSeqInstanceToTranslate.copyBioseqInstance()\n+ BioseqUtils.setFrameInfoOnHeader(seq3, 3)\n+ BioseqUtils.translateSequence(seq3, 3)\n+ return [seq1, seq2, seq3]\n+ \n+ _translateInPositiveFrames = staticmethod( _translateInPositiveFrames )\n+ \n+ def _translateInNegativeFrames(bioSeqInstanceToTranslate):\n+ seq4 = bioSeqInstanceToTranslate.copyBioseqInstance()\n+ seq4.reverseComplement()\n+ BioseqUtils.setFrameInfoOnHeader(seq4, 4)\n+ BioseqUtils.translateSequence(seq4, 1)\n+ seq5 = bioSeqInstanceToTranslate.copyBioseqInstance()\n+ 
seq5.reverseComplement()\n+ BioseqUtils.setFrameInfoOnHeader(seq5, 5)\n+ BioseqUtils.translateSequence(seq5, 2)\n+ seq6 = bioSeqInstanceToTranslate.copyBioseqInstance()\n+ seq6.reverseComplement()\n+ BioseqUtils.setFrameInfoOnHeader(seq6, 6)\n+ BioseqUtils.translateSequence(seq6, 3)\n+ return [seq4, seq5, seq6]\n+ \n+ _translateInNegativeFrames = staticmethod( _translateInNegativeFrames )\n+ \n+ \n+ ## Return a dictionary which keys are sequence headers and values sequence lengths.\n+ #\n+ def getLengthPerSeqFromFile( inFile ):\n+ dHeader2Length = {}\n+ inFileHandler = open( inFile, "r" )\n+ while True:\n+ iBs = Bioseq()\n+ iBs.read( inFileHandler )\n+ if iBs.sequence == None:\n+ break\n+ dHeader2Length[ iBs.header ] = iBs.getLength()\n+ inFileHandler.close()\n+ return dHeader2Length\n+ \n+ getLengthPerSeqFromFile = staticmethod( getLengthPerSeqFromFile )\n+ \n+ \n+ ## Return the list of Bioseq instances, these being sorted in decreasing length\n+ #\n+ def getBioseqListSortedByDecreasingLength( lBioseqs ):\n+ return sorted( lBioseqs, key=lambda iBs: ( iBs.getLength() ), reverse=True )\n+ \n+ getBioseqListSortedByDecreasingLength = staticmethod( getBioseqListSortedByDecreasingLength )\n+ \n+ \n+ ## Return the list of Bioseq instances, these being sorted in decreasing length (without gaps)\n+ #\n+ def getBioseqListSortedByDecreasingLengthWithoutGaps( lBioseqs ):\n+ return sorted( lBioseqs, key=lambda iBs: ( len(iBs.sequence.replace("-","")) ), reverse=True )\n+ \n+ getBioseqListSortedByDecreasingLengthWithoutGaps = staticmethod( getBioseqListSortedByDecreasingLengthWithoutGaps )\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/seq/ClusterConsensusCollection.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/ClusterConsensusCollection.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
import re


## Record a collection of BioseqDB instances, one per cluster of consensus
#
# The collection is filled from a fasta file whose headers are expected to
# look like '<prefix><clusterNumber>Mb<memberNumber> <sequenceHeader> ...'.
#
class ClusterConsensusCollection(object):

    ## Compiled pattern of a clustered header (raw string: '\D', '\d' and
    #  '\s' are regex escapes, not Python string escapes)
    #
    _HEADER_PATTERN = re.compile(r"(\D*)(\d+)Mb\d+\s.*")

    ## constructor
    #
    # @param clusterFileName string name of file containing the cluster of consensus
    #
    def __init__(self, clusterFileName):
        self._clusterFileName = clusterFileName
        self._lClusterConsensus = []

    ## Equal operator: same file name and equal lists of cluster BioseqDB
    #
    def __eq__(self, o):
        return self._clusterFileName == o._clusterFileName and self._lClusterConsensus == o._lClusterConsensus

    ## Return the list of BioseqDB instances (one per cluster, in file order)
    #
    def getLClusterConsensus(self):
        return self._lClusterConsensus

    ## Read the cluster file and append one BioseqDB per run of consecutive
    #  headers sharing the same cluster name
    #
    # Each BioseqDB is named after its cluster and its sequences are renamed
    # with the second space-separated field of their original header.
    # A cluster name is only compared with the previous one, so a name that
    # reappears later in the file starts a new BioseqDB.
    # An empty cluster file leaves the collection unchanged.
    #
    def fillCollection(self):
        # Imported here, where it is used, so that the parsing and lookup
        # helpers of this class stay importable without the whole 'commons'
        # package on the path.
        from commons.core.seq.BioseqDB import BioseqDB

        iBioseqDBAllCluster = BioseqDB()
        # 'with' guarantees the handler is closed even if read() raises
        with open(self._clusterFileName, "r") as fClusterFile:
            iBioseqDBAllCluster.read(fClusterFile)

        previousClusterName = None
        clusterConsensus = None
        for header in iBioseqDBAllCluster.getHeaderList():
            clusterName, seqHeader = self._getClusterNameAndSeqHeader(header)
            if clusterConsensus is None or clusterName != previousClusterName:
                # a new cluster begins: store the finished one, if any
                if clusterConsensus is not None:
                    self._lClusterConsensus.append(clusterConsensus)
                previousClusterName = clusterName
                clusterConsensus = BioseqDB()
                clusterConsensus.setName(clusterName)
            self._addBioseqInClusterConsensus(iBioseqDBAllCluster, header, seqHeader, clusterConsensus)

        # store the last cluster (nothing to store when the file was empty)
        if clusterConsensus is not None:
            self._lClusterConsensus.append(clusterConsensus)

    ## Split a clustered header into its cluster name and its sequence header
    #
    # @param header string such as 'Cluster12Mb3 seqA ...'
    #
    # @return a tuple (clusterName, seqHeader), e.g. ('Cluster12', 'seqA')
    #
    # @warning raise ValueError if the header does not match the expected pattern
    #
    def _getClusterNameAndSeqHeader(self, header):
        m = self._HEADER_PATTERN.match(header)
        if m is None:
            raise ValueError("header '%s' does not match '<name><number>Mb<number> <seqHeader>'" % header)
        clusterName = m.group(1) + m.group(2)
        # the sequence header is the second field when splitting on single spaces
        seqHeader = header.split(" ")[1]
        return clusterName, seqHeader

    ## Fetch a Bioseq by its original header, rename it and add it to a cluster
    #
    # @param iBioseqDBAllCluster BioseqDB holding all the sequences of the cluster file
    # @param firstHeader string original header used to fetch the Bioseq
    # @param seqHeader string new header given to the Bioseq
    # @param clusterConsensus BioseqDB of the cluster receiving the Bioseq
    #
    # @note setHeader() is called on the object returned by fetch(); if fetch()
    #       returns the stored instance rather than a copy (not visible here),
    #       the source BioseqDB sees the renamed header as well.
    #
    def _addBioseqInClusterConsensus(self, iBioseqDBAllCluster, firstHeader, seqHeader, clusterConsensus):
        ibioseq = iBioseqDBAllCluster.fetch(firstHeader)
        ibioseq.setHeader(seqHeader)
        clusterConsensus.add(ibioseq)

    ## Give the number (starting at 1) of the first cluster containing a consensus
    #
    # @param seqName string header of the consensus
    #
    # @return an integer, or None if no cluster contains this header
    #
    def getNumClusterForAConsensus(self, seqName):
        for nbCluster, bioseqDB in enumerate(self._lClusterConsensus, 1):
            if seqName in bioseqDB.getHeaderList():
                return nbCluster
        return None

    ## Give the number of consensus in a cluster
    #
    # @param numCluster integer number of the cluster (starting at 1)
    #
    # @return an integer
    #
    # @warning raise IndexError if there is no cluster with this number
    #
    def getNumConsensusInCluster(self, numCluster):
        # without this guard, 0 or a negative number would silently wrap
        # around and report the size of a cluster counted from the end
        if numCluster < 1:
            raise IndexError("cluster numbers start at 1, got %s" % numCluster)
        return self._lClusterConsensus[numCluster - 1].getSize()
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/seq/FastaUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/seq/FastaUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,1197 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import os\n+import sys\n+import string\n+import math\n+import shutil\n+import re\n+import glob\n+from operator import itemgetter\n+from commons.core.seq.BioseqDB import BioseqDB\n+from commons.core.seq.Bioseq import Bioseq\n+from commons.core.coord.MapUtils import MapUtils\n+from commons.core.coord.Range import Range\n+from commons.core.checker.CheckerUtils import CheckerUtils\n+from commons.core.launcher.LauncherUtils import LauncherUtils\n+from commons.core.coord.ConvCoord import ConvCoord\n+from commons.core.parsing.FastaParser import FastaParser\n+\n+\n+## Static methods for fasta file manipulation\n+#\n+class FastaUtils( object ):\n+ \n+ ## Count the number of sequences in the input fasta file\n+ #\n+ # @param inFile name of the input fasta file\n+ #\n+ # @return integer number of sequences in the input fasta file\n+ #\n+ @staticmethod\n+ def dbSize( inFile ):\n+ nbSeq = 0\n+ inFileHandler = open( inFile, "r" )\n+ line = inFileHandler.readline()\n+ while line:\n+ if line[0] == ">":\n+ nbSeq = nbSeq + 1\n+ line = inFileHandler.readline()\n+ inFileHandler.close()\n+ \n+ return nbSeq\n+ \n+ \n+ ## Compute the cumulative sequence length in the input fasta file\n+ #\n+ # @param inFile handler of the input fasta file\n+ #\n+ @staticmethod\n+ def dbCumLength( inFile ):\n+ cumLength = 0\n+ line = inFile.readline()\n+ while line:\n+ if line[0] != ">":\n+ cumLength += len(string.rstrip(line))\n+ line = inFile.readline()\n+ \n+ return cumLength\n+ \n+ \n+ ## Return a list with the length of each sequence in the input fasta file\n+ #\n+ # @param inFile string name of the input fasta file\n+ #\n+ @staticmethod\n+ def dbLengths( inFile ):\n+ lLengths = []\n+ inFileHandler = open( inFile, "r" )\n+ currentLength = 0\n+ line = inFileHandler.readline()\n+ while line:\n+ if line[0] == ">":\n+ if currentLength != 0:\n+ 
lLengths.append( currentLength )\n+ currentLength = 0\n+ else:\n+ currentLength += len(line[:-1])\n+ line = inFileHandler.readline()\n+ lLengths.append( currentLength )\n+ inFileHandler.close()\n+ return lLengths\n+ \n+ \n+ '..b'f:\n+ line = f.readline()\n+ while line:\n+ lineWithoutLastChar = line.rstrip()\n+ lHeaders = lineWithoutLastChar.split("\\t")\n+ clusterId += 1\n+ if verbosity > 0:\n+ print "%i sequences in cluster %i" % (len(lHeaders), clusterId)\n+ memberId = 0\n+ for header in lHeaders:\n+ memberId += 1\n+ dHeader2ClusterClusterMember[header] = (clusterId, memberId)\n+ line = f.readline()\n+ if verbosity > 0:\n+ print "%i clusters" % clusterId\n+ return dHeader2ClusterClusterMember, clusterId\n+ \n+ @staticmethod\n+ def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):\n+ """\n+ Write a map file from fasta output of clustering tool.\n+ Warning: only works if input fasta headers are formated like LTRharvest fasta output.\n+ """\n+ if not outMapFileName:\n+ outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])\n+ \n+ fileDb = open(fastaFileNameFromClustering , "r")\n+ fileMap = open(outMapFileName, "w")\n+ seq = Bioseq()\n+ numseq = 0\n+ while 1:\n+ seq.read(fileDb)\n+ if seq.sequence == None:\n+ break\n+ numseq = numseq + 1\n+ ID = seq.header.split(\' \')[0].split(\'_\')[0]\n+ chunk = seq.header.split(\' \')[0].split(\'_\')[1]\n+ start = seq.header.split(\' \')[-1].split(\',\')[0][1:]\n+ end = seq.header.split(\' \')[-1].split(\',\')[1][:-1]\n+ line = \'%s\\t%s\\t%s\\t%s\' % (ID, chunk, start, end)\n+ fileMap.write(line + "\\n")\n+ \n+ fileDb.close()\n+ fileMap.close()\n+ print "saved in %s" % outMapFileName\n+\n+ @staticmethod\n+ def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):\n+ outFormat = outFormat.lower()\n+ if outFormat in ["gff", "gff3"]:\n+ outFormat = "gff3"\n+ else:\n+ outFormat = "map"\n+ \n+ lTNstretches = []\n+ if nbN != 0:\n+ iBSDB = 
BioseqDB(fastaFileName)\n+ for iBS in iBSDB.db:\n+ nbNFound = 0\n+ start = 1\n+ pos = 1\n+ lastPos = 0\n+ \n+ while pos <= iBS.getLength():\n+ if nbNFound == 0:\n+ start = pos\n+ \n+ while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in [\'n\', \'x\']:\n+ nbNFound += 1\n+ pos += 1\n+ lastPos = pos\n+ \n+ if pos - lastPos >= nbN:\n+ if nbNFound >= nbN:\n+ lTNstretches.append((iBS.getHeader(), start, lastPos - 1))\n+ nbNFound = 0\n+ pos += 1\n+ \n+ if nbNFound >= nbN:\n+ lTNstretches.append((iBS.getHeader(), start, lastPos - 1))\n+ \n+ lTNstretches.sort(key = itemgetter(0, 1, 2))\n+ \n+ if outFileName == "":\n+ outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)\n+ \n+ with open(outFileName, "w") as fH:\n+ if outFormat == "gff3":\n+ fH.write("##gff-version 3\\n")\n+ for item in lTNstretches:\n+ seq = item[0]\n+ start = item[1]\n+ end = item[2]\n+ if outFormat == "gff3":\n+ fH.write("%s\\tFastaUtils\\tN_stretch\\t%s\\t%s\\t.\\t.\\t.\\tName=N_stretch_%s-%s\\n" % (seq, start, end, start, end))\n+ else:\n+ fH.write("N_stretch\\t%s\\t%s\\t%s\\n" % (seq, start, end))\n+ \n+ \n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/utils/FileUtils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/utils/FileUtils.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,445 @@\n+# Copyright INRA (Institut National de la Recherche Agronomique)\n+# http://www.inra.fr\n+# http://urgi.versailles.inra.fr\n+#\n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use, \n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info". \n+#\n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability. \n+#\n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or \n+# data to be ensured and, more generally, to use and operate it in the \n+# same conditions as regards security. 
\n+#\n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+\n+\n+import os\n+import glob\n+import shutil\n+import sys\n+import re\n+import math\n+try:\n+ import hashlib\n+except:\n+ pass\n+\n+\n+class FileUtils( object ):\n+ \n+ ## Return the number of lines in the given file\n+ #\n+ def getNbLinesInSingleFile( fileName ):\n+ fileHandler = open( fileName, "r" )\n+ lines = fileHandler.readlines()\n+ fileHandler.close()\n+ if (len(lines)>0 and lines[-1]== "\\n"):\n+ return (len(lines)-1)\n+ else :\n+ return len(lines)\n+ \n+ getNbLinesInSingleFile = staticmethod( getNbLinesInSingleFile )\n+ \n+ ## Return the number of lines in the files in the given list\n+ #\n+ def getNbLinesInFileList( lFileNames ):\n+ count = 0\n+ for fileName in lFileNames:\n+ count += FileUtils.getNbLinesInSingleFile( fileName )\n+ return count\n+ \n+ getNbLinesInFileList = staticmethod( getNbLinesInFileList )\n+ \n+ ## Return True if the given file exists, False otherwise\n+ #\n+ def isRessourceExists( fileName ):\n+ return os.path.exists( fileName )\n+ \n+ isRessourceExists = staticmethod( isRessourceExists )\n+ \n+ ## Return True if the given file is empty, False otherwise\n+ #\n+ def isEmpty( fileName ):\n+ return 0 == FileUtils.getNbLinesInSingleFile( fileName )\n+ \n+ isEmpty = staticmethod( isEmpty )\n+ \n+ ## Return True if both files are identical, False otherwise\n+ #\n+ def are2FilesIdentical( file1, file2 ):\n+ tmpFile = "diff_%s_%s" % ( os.path.basename(file1), os.path.basename(file2) )\n+ cmd = "diff %s %s >> %s" % ( file1, file2, tmpFile )\n+ returnStatus = os.system( cmd )\n+ if returnStatus != 0:\n+ print "WARNING: \'diff\' returned \'%i\'" % returnStatus\n+ os.remove( tmpFile )\n+ return False\n+ if FileUtils.isEmpty( tmpFile ):\n+ os.remove( tmpFile )\n+ return True\n+ else:\n+ os.remove( tmpFile )\n+ return False\n+ \n+ are2FilesIdentical = staticmethod( are2FilesIdentical )\n+ \n+ ## 
Return a string with all the content of the files in the given list\n+ #\n+ def getFileContent( lFiles ):\n+ content = ""\n+ lFiles.sort()\n+ for fileName in lFiles:\n+ currentFile = open( fileName, "r" )\n+ content += currentFile.re'..b' \n+ ## Give the list of file names found in the given directory\n+ #\n+ # @param dirPath string absolute path of the given directory\n+ #\n+ # @return lFilesInDir list of file names\n+ #\n+ def getFileNamesList( dirPath, patternFileFilter = ".*" ):\n+ lFilesInDir = []\n+ lPaths = glob.glob( dirPath + "/*" )\n+ for ressource in lPaths:\n+ if os.path.isfile( ressource ):\n+ fileName = os.path.basename( ressource )\n+ if re.match(patternFileFilter, fileName):\n+ lFilesInDir.append( fileName )\n+ return lFilesInDir\n+ \n+ getFileNamesList = staticmethod( getFileNamesList )\n+ \n+ ## Return the MD5 sum of a file\n+ #\n+ def getMd5SecureHash( inFile ):\n+ if "hashlib" in sys.modules:\n+ md5 = hashlib.md5()\n+ inFileHandler = open( inFile, "r" )\n+ while True:\n+ line = inFileHandler.readline()\n+ if line == "":\n+ break\n+ md5.update( line )\n+ inFileHandler.close()\n+ return md5.hexdigest()\n+ else:\n+ return ""\n+ \n+ getMd5SecureHash = staticmethod( getMd5SecureHash )\n+ \n+ ## Cat all files of a given directory\n+ #\n+ # @param dir string directory name\n+ # @param outFileName string output file name\n+ #\n+ def catFilesOfDir(dir, outFileName):\n+ lFiles = FileUtils.getFileNamesList(dir)\n+ lFile2 = []\n+ for file in lFiles:\n+ lFile2.append(dir + "/" + file)\n+ FileUtils.catFilesFromList(lFile2, outFileName)\n+ \n+ catFilesOfDir = staticmethod(catFilesOfDir)\n+ \n+ ## Return True if size file > 0 octet\n+ #\n+ # @param fileName string file name\n+ #\n+ def isSizeNotNull(fileName):\n+ size = os.path.getsize(fileName)\n+ if size > 0:\n+ return True\n+ return False\n+ \n+ isSizeNotNull = staticmethod(isSizeNotNull)\n+ \n+ ## Split one file into N Files by lines\n+ #\n+ # @param fileName string file name\n+ # @param N int number of 
files to create\n+ # \n+ @staticmethod\n+ def splitFileIntoNFiles(fileName, N):\n+ nbLine = FileUtils.getNbLinesInSingleFile(fileName)\n+ nbLinesInEachFile = nbLine\n+ if N > nbLine:\n+ N = nbLine\n+ if N != 0:\n+ nbLinesInEachFile = math.ceil(float(nbLine) / N)\n+ else:\n+ N = 1\n+ filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))\n+ fileHandler = open(fileName, "r")\n+ for i in range(1,N+1):\n+ with open("%s-%s%s" %(filePrefix, i, fileExt), "w") as f:\n+ j = 0\n+ while j < nbLinesInEachFile:\n+ j += 1\n+ f.write(fileHandler.readline())\n+ fileHandler.close() \n+ \n+ ## Split one file into files of N lines\n+ #\n+ # @param fileName string input file name\n+ # @param N int lines number per files\n+ # \n+ @staticmethod\n+ def splitFileAccordingToLineNumber(fileName, N):\n+ filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))\n+ with open(fileName) as inF:\n+ fileNb = 1\n+ line = inF.readline()\n+ if not line or N == 0:\n+ outFileName = "%s-%s%s" %(filePrefix, fileNb, fileExt)\n+ f = open(outFileName, "wb")\n+ shutil.copyfileobj(open(fileName, "rb"), f)\n+ f.close()\n+ else:\n+ while line:\n+ outFileName = "%s-%s%s" %(filePrefix, fileNb, fileExt)\n+ with open(outFileName, "w") as outF:\n+ lineNb = 1\n+ while lineNb <= N and line:\n+ outF.write(line)\n+ line = inF.readline()\n+ lineNb += 1\n+ fileNb += 1\n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/utils/PipelineStepFTests.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/utils/PipelineStepFTests.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,83 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 

import sys
import os
import shutil
from commons.core.utils.FileUtils import FileUtils


class PipelineStepFTests(object):
    """
    Scaffolding shared by the functional tests of pipeline steps.

    run() chains launchStep() and assertStep(); neither is defined in this
    class, so they are presumably provided by subclasses
    (NOTE(review): confirm against the concrete test classes).
    """

    def __init__(self, pipelineName, packageDir, workingDir, projectName, config = "", clean = True):
        """
        Store the settings of the functional test.
        @param pipelineName: name printed in the test messages
        @param packageDir:   stored as-is, not used in this class
        @param workingDir:   directory removed by _printMessageAndClean when clean is True
        @param projectName:  stored as-is, not used in this class
        @param config:       configuration file name (empty string by default)
        @param clean:        whether workingDir is deleted at the end
        """
        self._pipelineName   = pipelineName
        self._projectName    = projectName
        self._packageDir     = packageDir
        self._workingDir     = workingDir
        self._configFileName = config
        self._clean          = clean

    def run(self):
        """
        Launch the step, then check its results.
        """
        self.launchStep()
        self.assertStep()

#    def replaceInFile(self, fileName, oldPattern, newPattern, newFileName = ""):
#        if newFileName == "":
#            newFileName = "%s.new" % fileName
#        f = open(newFileName, "w")
#        for line in fileinput.input(fileName, inplace=1):
#            newLine = line.replace(oldPattern, newPattern)
#            f.write(newLine)
#        f.close()
#        fileinput.close()

    def _checkIfFileExist(self, fileName):
        """
        Tell whether the given path exists; print a message when it does not.
        @return: True if the path exists, False otherwise
        """
        if FileUtils.isRessourceExists(fileName):
            return True
        print("%s do not exists\n" % fileName)
        return False

    def _printMessageAndClean(self, msg):
        """
        Print the given message, move to the parent directory and, when
        cleaning is enabled, delete the working directory.
        """
        report = "%s in %s functional test\n" % (msg, self._pipelineName)
        print(report)
        sys.stdout.flush()
        os.chdir("../")
        if not self._clean:
            return
        shutil.rmtree(self._workingDir)

    def _areTwoFilesIdenticalByScript( self, expFileName, obsFileName, scriptName):
        """
        Run the given comparison script on the expected and observed files.
        @return: True when the script exits with status 0, False otherwise
        """
        command = "%s -v 1 -r %s -t %s 2>/dev/null" % (scriptName, expFileName, obsFileName)
        exitStatus = os.system(command)
        sys.stdout.flush()
        return exitStatus == 0
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/utils/RepetConfigParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/utils/RepetConfigParser.py Tue Apr 30 15:02:29 2013 -0400 |
b |
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and,  more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


try:
    from ConfigParser import ConfigParser
except ImportError:
    # the module was renamed in Python 3
    from configparser import ConfigParser


class RepetConfigParser(ConfigParser):
    """
    ConfigParser whose option names are kept exactly as written.
    """

    def optionxform(self, optionstr):
        """
        Return the option name unchanged (overrides the parent transformation).
        @param optionstr: option name as read or given
        @type  optionstr: string
        @return: the same string
        """
        return optionstr
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/utils/RepetOptionParser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/utils/RepetOptionParser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,79 @@ +#!/usr/bin/env python + +""" +Class overriding optparse.OptionParser default epilog formatter. +The resulting epilog display format is the same as if the corresponding string was printed. +""" + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 

from optparse import OptionParser
from optparse import BadOptionError
from optparse import OptionValueError
SUPPRESS_USAGE = "SUPPRESS"+"USAGE"


class RepetOptionParser(OptionParser):
    """
    OptionParser that prints the description and the epilog verbatim, and
    that behaves as if "-h" was given when no argument is provided.
    """

    def parse_args(self, args=None, values=None):
        """
        Parse the arguments; an empty argument list is replaced by ["-h"].
        @param args:   arguments to parse (resolved by _get_args when None)
        @param values: object receiving the option values (defaults when None)
        @return: whatever check_values returns for (values, remaining arguments)
        """
        rargs = self._get_args(args)
        if not rargs:
            # no argument at all: fall back on the help option
            rargs = ["-h"]
        if values is None:
            values = self.get_default_values()
        self.rargs = rargs
        self.largs = largs = []
        self.values = values
        try:
            self._process_args(largs, rargs, values)
        except (BadOptionError, OptionValueError) as err:
            self.error(str(err))
        args = largs + rargs
        return self.check_values(values, args)

    def set_usage(self, usage):
        """
        Set the usage string.
        An empty/None value or the SUPPRESS_USAGE marker disables the usage;
        a leading "usage: " (any case) is stripped.
        @param usage: usage string
        @type  usage: string
        """
        # compare by value: string identity is only true by accident of interning
        if not usage or usage == SUPPRESS_USAGE:
            self.usage = None
        elif usage.lower().startswith("usage: "):
            self.usage = usage[7:]
        else:
            self.usage = usage

    def format_epilog(self, formatter):
        """
        Return the epilog as is (the formatter is ignored), "" if there is none.
        """
        if self.epilog is not None:
            return self.epilog
        return ""

    def format_description(self, formatter):
        """
        Return the description as is (the formatter is ignored), "" if there is none.
        """
        if self.description is not None:
            return self.description
        return ""
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/BedWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/BedWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,100 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class BedWriter(TranscriptListWriter):
    """
    Writer producing a BED file from a list of transcripts
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar header:   first lines of the file
    @type header:   string
    """

    def __init__(self, fileName, verbosity = 0):
        """
        Build the writer; the default track header is set before the parent
        constructor is called
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.header = "track name=reads description=\"Reads\" useScore=0 visibility=full offset=0\n"
        super(BedWriter, self).__init__(fileName, verbosity)

    @staticmethod
    def getFileFormats():
        """
        List the format names handled by this writer
        """
        return ["bed"]

    @staticmethod
    def getExtension():
        """
        Give the usual file extension
        """
        return "bed"

    def setTitle(self, title):
        """
        Use the given title as name and description of the track
        (nothing is changed when the title is None)
        @param title: the title of the track
        @type  title: string
        """
        if title != None:
            trackLine = "track name=%s description=\"%s\" useScore=0 visibility=full offset=0\n" % (title, title)
            self.header = trackLine

    def copyProperties(self, bedParser):
        """
        Reuse the title read by a parser, to produce a similar output
        @param bedParser: a BED parser
        @type  bedParser: class L{BedParser<BedParser>}
        """
        parsedTitle = bedParser.title
        self.setTitle(parsedTitle)

    def printTranscript(self, transcript):
        """
        Export the given transcript with BED format
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript<Transcript>}
        @return:           a string
        """
        bedLine = transcript.printBed()
        return bedLine

b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/CsvWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/CsvWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,153 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os
import random
from commons.core.writer.TranscriptListWriter import TranscriptListWriter
from SMART.Java.Python.misc.Progress import Progress


class CsvWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with CSV (Excel) format
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar title:    column names; "tags" is replaced by one column per tag
    @type title:    string
    @ivar modified: whether the file has already been re-written
    @type modified: boolean
    """

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(CsvWriter, self).__init__(fileName, verbosity)
        self.header   = ""
        self.title    = "chromosome,start,end,strand,exons,tags\n"
        self.modified = False

    def __del__(self):
        """
        Destructor
        (Trick to write 1 tag per column)
        """
        if self.handle is not None:
            self.modifyCsv()
        super(CsvWriter, self).__del__()

    def close(self):
        """
        Re-write the file with one column per tag, then close it
        """
        if self.handle is not None:
            self.modifyCsv()
        super(CsvWriter, self).close()

    def modifyCsv(self):
        """
        Clean CSV file so that there is one column per tag
        The file is read twice: once to collect the tag names, once to
        re-write every line into a temporary file which then replaces it.
        Does nothing if the file has already been re-written.
        @raise ValueError: if a tag field without "=" is not preceded by a
                           "key=value" field on the same line
        """
        if self.modified:
            return
        import shutil
        import tempfile

        nbFirstFields = 5

        # read all the tags
        self.handle.close()
        self.handle = open(self.fileName)
        tags = set()
        if self.verbosity >= 10:
            print("Modifying CSV file...")
        number = -1
        for number, line in enumerate(self.handle):
            # NOTE(review): the first line is skipped, presumably because it
            # is expected to be a title line -- confirm against the parent class
            if number != 0:
                for tag in line.strip().split(",")[nbFirstFields:]:
                    if "=" in tag:
                        tags.add(tag.split("=", 1)[0])
        if self.verbosity >= 10:
            print(" ...done")

        # re-write the file, in a uniquely named temporary file
        fd, tmpFileName = tempfile.mkstemp(prefix = "tmpFile", suffix = ".csv", dir = os.getcwd())
        tmpFile = os.fdopen(fd, "w")
        try:
            self.handle.seek(0)
            progress = Progress(number + 1, "Re-writting CSV file", self.verbosity)
            tmpFile.write(self.title.replace("tags", ",".join(sorted(tags))))
            for line in self.handle:
                fields     = line.strip().split(",")
                tagValues  = dict([(key, None) for key in tags])
                currentKey = None
                tmpFile.write(",".join(fields[:nbFirstFields]))
                for tag in fields[nbFirstFields:]:
                    if "=" in tag:
                        currentKey, value = tag.split("=", 1)
                        tagValues[currentKey] = value
                    elif currentKey is None:
                        raise ValueError("Cannot attach field '%s' to any tag in line '%s' of file '%s'" % (tag, line.strip(), self.fileName))
                    else:
                        # a value that contained commas was split: glue it back
                        tagValues[currentKey] += ";%s" % (tag)
                for key in sorted(tagValues.keys()):
                    tmpFile.write(",%s" % (tagValues[key]))
                tmpFile.write("\n")
                progress.inc()
        except Exception:
            # do not leave a half-written temporary file behind
            tmpFile.close()
            os.remove(tmpFileName)
            raise
        tmpFile.close()

        # replace former file (mkstemp creates private files: restore the
        # permissions of the file being replaced first)
        shutil.copymode(self.fileName, tmpFileName)
        shutil.move(tmpFileName, self.fileName)
        progress.done()
        self.modified = True

    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["csv", "xls", "excel"]

    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "csv"

    def printTranscript(self, transcript):
        """
        Export the given transcript with CSV format
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript<Transcript>}
        @return:           a string
        """
        return transcript.printCsv()

b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/EmblWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/EmblWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,116 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class EmblWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into several files with EMBL format
    (one file per chromosome)
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file (always None here)
    @type handle:   file handle
    @ivar handles:  handles to the output files, indexed by chromosome name
    @type handles:  dict from string to file handle
    """

    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        The parent constructor is not called; the output files are opened
        when the first transcript of each chromosome is added.
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.fileName  = fileName
        self.verbosity = verbosity
        self.handles   = {}
        self.handle    = None

    def __del__(self):
        """
        Destructor
        Trick to append the sequences at the end of the EMBL files
        """
        # the attribute is not set by this class: it may be missing or None
        sequenceFileName = getattr(self, "sequenceFileName", None)
        try:
            if sequenceFileName is not None:
                handle = open(sequenceFileName)
                try:
                    currentHandle = None
                    for line in handle:
                        if line[0] == ">":
                            # header line: switch to the file of this chromosome, if any
                            currentHandle = self.handles.get(line[1:].strip())
                        elif currentHandle is not None:
                            currentHandle.write(line)
                finally:
                    handle.close()
        finally:
            # the output files are closed whatever happened above
            for handle in self.handles.values():
                handle.close()

    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["embl"]

    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "embl"

    def addTranscript(self, transcript):
        """
        Add a transcript to the list of transcripts to be written
        The file of the chromosome is created the first time it is needed.
        @param transcript: transcript to be written
        @type  transcript: class L{Transcript<Transcript>}
        """
        chromosome = transcript.getChromosome()
        if chromosome not in self.handles:
            # NOTE(review): the last 5 characters of fileName are dropped
            # whether or not it ends with ".embl"
            self.handles[chromosome] = open("%s%s.embl" % (self.fileName[:-len(".embl")], chromosome.title()), "w")
        self.handles[chromosome].write(self.printTranscript(transcript))

    def printTranscript(self, transcript):
        """
        Export the given transcript with EMBL format
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript<Transcript>}
        @return:           a string
        """
        return transcript.printEmbl()


b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/FastaWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/FastaWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,77 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.SequenceListWriter import SequenceListWriter


class FastaWriter(SequenceListWriter):
    """
    Writer producing a FASTA file from a list of sequences
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar header:   first lines of the file
    @type header:   string
    """

    def __init__(self, fileName, verbosity = 0):
        """
        Build the writer (everything is delegated to the parent class)
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(FastaWriter, self).__init__(fileName, verbosity)

    @staticmethod
    def getFileFormats():
        """
        List the format names handled by this writer
        """
        formats = ["fasta", "mfa"]
        return formats

    @staticmethod
    def getExtension():
        """
        Give the usual file extension
        """
        return "fasta"

    def getLine(self, sequence):
        """
        Give the FASTA representation of a sequence
        @param sequence: sequence to be written
        @type  sequence: class L{Sequence<Sequence>}
        """
        fastaText = sequence.printFasta()
        return fastaText
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/FastqWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/FastqWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,78 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.SequenceListWriter import SequenceListWriter


class FastqWriter(SequenceListWriter):
    """
    Writer producing a FASTQ file from a list of sequences
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle:   handle to the file
    @type handle:   file handle
    @ivar header:   first lines of the file
    @type header:   string
    """

    def __init__(self, fileName, verbosity = 0):
        """
        Build the writer (everything is delegated to the parent class)
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(FastqWriter, self).__init__(fileName, verbosity)

    @staticmethod
    def getFileFormats():
        """
        List the format names handled by this writer
        """
        formats = ["fastq", "mfq"]
        return formats

    @staticmethod
    def getExtension():
        """
        Give the usual file extension
        """
        return "fastq"

    def getLine(self, sequence):
        """
        Give the FASTQ representation of a sequence
        @param sequence: sequence to be written
        @type  sequence: class L{Sequence<Sequence>}
        """
        fastqText = sequence.printFastq()
        return fastqText
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/GbWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/GbWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,102 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class GbWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with GBrowse format
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle: handle to the file
    @type handle: file handle
    @ivar header: first lines of the file (track description)
    @type header: string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName: name of the file
        @type fileName: string
        @param verbosity: verbosity
        @type verbosity: int
        """
        super(GbWriter, self).__init__(fileName, verbosity)
        # The default header must be set AFTER the parent constructor:
        # TranscriptListWriter.__init__ resets self.header to "", so setting
        # it first silently discarded the default track description.
        self.header = "[READS]\nbgcolor = red\nstrand_arrow = 1\n\n"


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        @return: a list of format names
        """
        return ["gb", "gbrowse"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        @return: a string
        """
        return "gb"


    def setColor(self, color):
        """
        Set the color of the track (rebuilds the header)
        Does nothing when the color is None
        @param color: the color of the track
        @type color: string
        """
        if color is not None:
            self.header = "[READS]\nbgcolor= %s\nstrand_arrow = 1\n\n" % (color)


    def copyProperties(self, gbParser):
        """
        Copy the properties collected by a parser, to produce a similar output
        Only the C{color} attribute of the parser is read
        @param gbParser: a GBrowse parser
        @type gbParser: class L{GbParser<GbParser>}
        """
        self.setColor(gbParser.color)


    def printTranscript(self, transcript):
        """
        Export the given transcript with GBrowse format
        When the transcript lies on the same chromosome as the previously
        written one, C{printGBrowseLine()} is used; otherwise C{printGBrowse()}
        (presumably the latter also emits the reference line -- confirm in Transcript)
        @param transcript: transcript to be printed
        @type transcript: class L{Transcript<Transcript>}
        @return: a string
        """
        if self.lastChromosome is not None and self.lastChromosome == transcript.getChromosome():
            return transcript.printGBrowseLine()
        return transcript.printGBrowse()
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/Gff2Writer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/Gff2Writer.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,89 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class Gff2Writer(TranscriptListWriter):
    """
    Transcript writer producing GFF2 output
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle: handle to the file
    @type handle: file handle
    @ivar title: title handed to the transcript when it is formatted
    @type title: string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Build the writer with an empty title and an empty header
        @param fileName: name of the output file
        @type fileName: string
        @param verbosity: verbosity level
        @type verbosity: int
        """
        self.title = ""
        self.header = ""
        super(Gff2Writer, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Names under which this format is known
        @return: a list of strings
        """
        knownFormats = ["gff2"]
        return knownFormats


    @staticmethod
    def getExtension():
        """
        Usual file extension for this format
        @return: a string
        """
        extension = "gff2"
        return extension


    def setTitle(self, title):
        """
        Change the title used for the next transcripts
        @param title: the title of the transcripts
        @type title: string
        """
        self.title = title


    def printTranscript(self, transcript):
        """
        Format one transcript; the work is done by C{transcript.printGff2}
        @param transcript: transcript to be printed
        @type transcript: class L{Transcript<Transcript>}
        @return: a string
        """
        gff2Text = transcript.printGff2(self.title)
        return gff2Text
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/Gff3Writer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/Gff3Writer.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,130 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class Gff3Writer(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with GFF3 format
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle: handle to the file
    @type handle: file handle
    @ivar title: value written in the second (source) column
    @type title: string
    @ivar feature: default value of the third column for transcripts
    @type feature: string
    @ivar featurePart: value of the third column for exons
    @type featurePart: string
    """


    def __init__(self, fileName, verbosity = 0, title="S-MART", feature="transcript", featurePart="exon"):
        """
        Constructor
        @param fileName: name of the file
        @type fileName: string
        @param verbosity: verbosity
        @type verbosity: int
        @param title: the title of the transcripts
        @type title: string
        @param feature: the name of the feature
        @type feature: string
        @param featurePart: the name of the feature part
        @type featurePart: string
        """
        self.header = ""
        self.title = title
        self.feature = feature
        self.featurePart = featurePart
        super(Gff3Writer, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        @return: a list of format names
        """
        return ["gff3", "gff"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        @return: a string
        """
        return "gff3"


    def setTitle(self, title):
        """
        Set the title of the transcripts
        @param title: the title of the transcripts
        @type title: string
        """
        self.title = title


    def setFeature(self, feature):
        """
        Set the name of the feature
        @param feature: the name of the feature
        @type feature: string
        """
        self.feature = feature


    def setFeaturePart(self, featurePart):
        """
        Set the name of the feature part
        @param featurePart: the name of the feature part
        @type featurePart: string
        """
        self.featurePart = featurePart


    def printTranscript(self, transcript):
        """
        Export the given transcript with GFF3 format
        One line is produced for the transcript itself, followed by one line
        per exon when the transcript has more than one exon.
        The exons of the transcript are sorted, and an "ID" tag is added to
        the transcript when it has none; its other tags are left untouched.
        @param transcript: transcript to be printed
        @type transcript: class L{Transcript<Transcript>}
        @return: a string
        """
        direction = "+"
        if transcript.getDirection() == -1:
            direction = "-"
        transcript.sortExonsIncreasing()
        # Test the tag *names*: getTagValues() is the formatted tag string
        # (see its use for "comment" below), so the former test was a
        # substring search that any value containing "ID" would satisfy.
        if "ID" not in transcript.getTagNames():
            transcript.setTagValue("ID", transcript.getUniqueName())
        feature = self.feature
        score = "."
        # "feature" and "score" have their own columns: they are removed from
        # the tags while the attribute column is built, then put back.  The
        # former code kept an alias of the dict and "restored" it on the
        # writer (self.tags), so the transcript lost both tags for good.
        removedTags = {}
        try:
            if "feature" in transcript.getTagNames():
                feature = transcript.getTagValue("feature")
                removedTags["feature"] = transcript.tags["feature"]
                del transcript.tags["feature"]
            if "score" in transcript.getTagNames():
                score = "%d" % (int(transcript.getTagValue("score")))
                removedTags["score"] = transcript.tags["score"]
                del transcript.tags["score"]
            comment = transcript.getTagValues(";", "=")
        finally:
            for tagName in removedTags:
                transcript.tags[tagName] = removedTags[tagName]
        string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (transcript.getChromosome(), self.title, feature, transcript.getStart(), transcript.getEnd(), score, direction, comment)
        if len(transcript.exons) > 1:
            for i, exon in enumerate(transcript.getExons()):
                # Each exon falls back to the transcript score; a score found
                # on one exon must not leak into the following ones.
                exonScore = score
                if "score" in exon.getTagNames():
                    exonScore = "%d" % (int(exon.getTagValue("score")))
                string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tID=%s-%s%d;Name=%s-%s%d;Parent=%s\n" % (transcript.getChromosome(), self.title, self.featurePart, exon.getStart(), exon.getEnd(), exonScore, direction, transcript.getTagValue("ID"), self.featurePart, i+1, transcript.name, self.featurePart, i+1, transcript.getTagValue("ID"))
        return string
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/GtfWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/GtfWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,89 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class GtfWriter(TranscriptListWriter):
    """
    Transcript writer producing GTF output
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle: handle to the file
    @type handle: file handle
    @ivar title: title handed to the transcript when it is formatted
    @type title: string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Build the writer with the default title "S-MART" and an empty header
        @param fileName: name of the output file
        @type fileName: string
        @param verbosity: verbosity level
        @type verbosity: int
        """
        self.title = "S-MART"
        self.header = ""
        super(GtfWriter, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Names under which this format is known
        @return: a list of strings
        """
        knownFormats = ["gtf", "gtf2"]
        return knownFormats


    @staticmethod
    def getExtension():
        """
        Usual file extension for this format
        @return: a string
        """
        extension = "gtf"
        return extension


    def setTitle(self, title):
        """
        Change the title used for the next transcripts
        @param title: the title of the transcripts
        @type title: string
        """
        self.title = title


    def printTranscript(self, transcript):
        """
        Format one transcript; the work is done by C{transcript.printGtf}
        @param transcript: transcript to be printed
        @type transcript: class L{Transcript<Transcript>}
        @return: a string
        """
        gtfText = transcript.printGtf(self.title)
        return gtfText
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/MapWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/MapWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,100 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class MapWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with map format
    (one tab-separated line per transcript: name, chromosome, start, end + 1)
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle: handle to the file
    @type handle: file handle
    @ivar title: title of the transcripts (stored, not used by the map output)
    @type title: string
    """


    def __init__(self, fileName, verbosity = 0, title="S-MART"):
        """
        Constructor
        @param fileName: name of the file
        @type fileName: string
        @param verbosity: verbosity
        @type verbosity: int
        @param title: the title of the transcripts
        @type title: string
        """
        self.header = ""
        self.title = title
        TranscriptListWriter.__init__(self, fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        @return: a list of format names
        """
        return ["map"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        @return: a string
        """
        return "map"


    def setTitle(self, title):
        """
        Set the title of the transcripts
        @param title: the title of the transcripts
        @type title: string
        """
        self.title = title


    def printTranscript(self, transcript):
        """
        Export the given transcript to map format
        The name is suffixed with "-<occurrence>" when the transcript carries
        a "nbOccurrences" tag different from 1 and a non-empty "occurrence" tag.
        @param transcript: transcript to be printed
        @type transcript: class L{Transcript<Transcript>}
        @return: a string
        """
        name = transcript.name
        # The guard formerly read the tag "occurrences" while the value used
        # is "occurrence"; both now refer to the same tag.
        if "nbOccurrences" in transcript.getTagNames() and transcript.getTagValue("nbOccurrences") != 1 and transcript.getTagValue("occurrence"):
            name = "%s-%d" % (name, transcript.getTagValue("occurrence"))
        # Kept for its side effect on the transcript (exon order); the exon
        # sizes/starts that used to be collected here were never written.
        transcript.sortExonsIncreasing()
        return "%s\t%s\t%d\t%d\n" % (name, transcript.getChromosome(), transcript.getStart(), transcript.getEnd()+1)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/MySqlTranscriptWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/MySqlTranscriptWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
b'@@ -0,0 +1,215 @@\n+#\n+# Copyright INRA-URGI 2009-2010\n+# \n+# This software is governed by the CeCILL license under French law and\n+# abiding by the rules of distribution of free software. You can use,\n+# modify and/ or redistribute the software under the terms of the CeCILL\n+# license as circulated by CEA, CNRS and INRIA at the following URL\n+# "http://www.cecill.info".\n+# \n+# As a counterpart to the access to the source code and rights to copy,\n+# modify and redistribute granted by the license, users are provided only\n+# with a limited warranty and the software\'s author, the holder of the\n+# economic rights, and the successive licensors have only limited\n+# liability.\n+# \n+# In this respect, the user\'s attention is drawn to the risks associated\n+# with loading, using, modifying and/or developing or reproducing the\n+# software by the user in light of its specific status of free software,\n+# that may mean that it is complicated to manipulate, and that also\n+# therefore means that it is reserved for developers and experienced\n+# professionals having in-depth computer knowledge. 
Users are therefore\n+# encouraged to load and test the software\'s suitability as regards their\n+# requirements in conditions enabling the security of their systems and/or\n+# data to be ensured and, more generally, to use and operate it in the\n+# same conditions as regards security.\n+# \n+# The fact that you are presently reading this means that you have had\n+# knowledge of the CeCILL license and that you accept its terms.\n+#\n+import os\n+import random\n+from SMART.Java.Python.mySql.MySqlTable import MySqlTable\n+from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable\n+from SMART.Java.Python.misc.Progress import Progress\n+\n+class MySqlTranscriptWriter(object):\n+ """\n+ A class that writes a transcript list into a mySQL table\n+ @ivar name: name of the tables \n+ @type name: string\n+ @ivar tables: the tables\n+ @type tables: dict of L{MySqlTranscriptTable<MySqlTranscriptTable>}\n+ @ivar mySqlConnection: connection to a MySQL database\n+ @type mySqlConnection: class L{MySqlConnection<MySqlConnection>}\n+ @ivar tmpTranscriptFileHandles: files where transcripts are temporary stored, before copy into database\n+ @type tmpTranscriptFileHandles: dict of file handles\n+ @ivar nbTranscriptsByChromosome: number of transcripts written\n+ @type nbTranscriptsByChromosome: dict of int (one for each chromosome)\n+ @ivar randomNumber: a random number, used for having a unique name for the tables\n+ @type randomNumber: int\n+ @ivar toBeWritten: there exists transcripts to be copied into database\n+ @type toBeWritten: bool \n+ @ivar verbosity: verbosity\n+ @type verbosity: int \n+ """\n+\n+\n+ def __init__(self, connection, name = None, verbosity = 0):\n+ """\n+ Constructor\n+ @param name: name of the file \n+ @type name: string\n+ @param verbosity: verbosity\n+ @type verbosity: int\n+ """\n+ self.name = name\n+ self.verbosity = verbosity\n+ self.tables = {}\n+ self.indices = {}\n+ self.tmpTranscriptFileHandles = {}\n+ 
self.nbTranscriptsByChromosome = {}\n+ self.toBeWritten = False\n+ self.randomNumber = random.randint(0, 100000)\n+ self.mySqlConnection = connection\n+ self.nbTmpFiles = 100\n+ self.transcriptValues = {}\n+ self.nbTranscriptValues = 1000\n+ if self.name != None:\n+ pos = self.name.rfind(os.sep)\n+ if pos != -1:\n+ self.name = self.name[pos+1:]\n+ \n+\n+ def __del__(self):\n+ '..b'+ if chromosome not in self.transcriptValues:\n+ self.transcriptValues[chromosome] = []\n+ \n+ self.transcriptValues[chromosome].append(transcript.getSqlValues())\n+\n+ self.nbTranscriptsByChromosome[chromosome] += 1\n+ self.toBeWritten = True\n+ if sum([len(transcripts) for transcripts in self.transcriptValues.values()]) > self.nbTranscriptValues:\n+ self.write() \n+\n+\n+ def addElement(self, element):\n+ """\n+ Same as "addTranscript"\n+ @param element: transcript to be written\n+ @type element: class L{Transcript<Transcript>}\n+ """\n+ self.addTranscript(element)\n+\n+\n+# def addTranscriptList(self, transcriptListParser):\n+# """\n+# Add a list of transcripts to the transcripts to be written\n+# @param transcriptListParser: transcripts to be written\n+# @type transcriptListParser: class L{TranscriptListParser<TranscriptListParser>}\n+# """\n+# progress = Progress(transcriptListParser.getNbTranscripts(), "Storing %s into database" % (transcriptListParser.fileName), self.verbosity)\n+# for transcript in transcriptListParser.getIterator():\n+# self.addTranscript(transcript)\n+# progress.inc()\n+# progress.done()\n+ \n+ \n+ def addTranscriptList(self, transcriptListParser):\n+ """\n+ Add a list of transcripts to the transcripts to be written\n+ @param transcriptListParser: transcripts to be written\n+ @type transcriptListParser: class L{TranscriptListParser<TranscriptListParser>}\n+ """\n+ self.transcriptListParser = transcriptListParser\n+ self.mySqlConnection.executeManyFormattedQueriesIterator(self)\n+ \n+ \n+ def getIterator(self):\n+ """\n+ Iterator to the SQL commands to insert the 
list\n+ """\n+ progress = Progress(self.transcriptListParser.getNbTranscripts(), "Storing %s into database" % (self.transcriptListParser.fileName), self.verbosity)\n+ for transcript in self.transcriptListParser.getIterator():\n+ chromosome = transcript.getChromosome()\n+ if chromosome not in self.tables:\n+ self.createTable(chromosome)\n+ self.nbTranscriptsByChromosome[chromosome] = self.nbTranscriptsByChromosome.get(chromosome, 0) + 1\n+ values = transcript.getSqlValues()\n+ #yield "INSERT INTO \'%s\' (%s) VALUES (%s)" % (self.tables[chromosome].name, ", ".join(self.tables[chromosome].variables), ", ".join([MySqlTable.formatSql(values[variable], self.tables[chromosome].types[variable], self.tables[chromosome].sizes[variable]) for variable in self.tables[chromosome].variables]))\n+ yield ("INSERT INTO \'%s\' (%s) VALUES (%s)" % (self.tables[chromosome].name, ", ".join(self.tables[chromosome].variables), ", ".join(["?"] * len(self.tables[chromosome].variables))), [values[variable] for variable in self.tables[chromosome].variables])\n+ progress.inc()\n+ progress.done()\n+ \n+ \n+ def write(self):\n+ """\n+ Copy the content of the files into the database\n+ (May add transcripts to already created databases)\n+ """\n+ for chromosome in self.transcriptValues:\n+ if chromosome in self.transcriptValues:\n+ self.tables[chromosome].insertManyFormatted(self.transcriptValues[chromosome])\n+ self.transcriptValues = {}\n+ self.toBeWritten = False\n+ \n+ \n+ def getTables(self):\n+ """\n+ Get the tables\n+ @return: the mySQL tables\n+ """\n+ if self.toBeWritten:\n+ self.write()\n+ return self.tables\n+\n+ \n+ \n+ def removeTables(self):\n+ """\n+ Drop the tables\n+ """\n+ for chromosome in self.tables:\n+ self.tables[chromosome].remove()\n\\ No newline at end of file\n' |
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/SamWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/SamWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,101 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os
import random
import shutil
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class SamWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with SAM format
    The @SQ header lines are only known once every transcript has been seen,
    so they are prepended to the file when the writer is closed.
    @ivar sizes: estimated sizes of the chromosomes
    @type sizes: dict of string to int
    @ivar headerWritten: whether the header has already been prepended
    @type headerWritten: boolean
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor
        @param fileName: name of the file
        @type fileName: string
        @param verbosity: verbosity
        @type verbosity: int
        """
        super(SamWriter, self).__init__(fileName, verbosity)
        self.sizes = {}
        self.headerWritten = False


    def close(self):
        """
        Close file (trick to add header)
        The records already written are copied behind the @SQ lines into a
        temporary file, which then replaces the output file.
        """
        super(SamWriter, self).close()
        if self.headerWritten:
            return
        tmpFileName = "tmpFile%d.sam" % (random.randint(0, 100000))
        try:
            tmpHandle = open(tmpFileName, "w")
            try:
                # sorted: gives a reproducible header order
                for chromosome in sorted(self.sizes):
                    tmpHandle.write("@SQ\tSN:%s\tLN:%d\n" % (chromosome, self.sizes[chromosome]))
                bodyHandle = open(self.fileName)
                try:
                    shutil.copyfileobj(bodyHandle, tmpHandle)
                finally:
                    bodyHandle.close()
            finally:
                tmpHandle.close()
            # shutil.move falls back to copy + delete when the temporary file
            # (current directory) and the output are on different file
            # systems, a case in which os.rename raises OSError.
            shutil.move(tmpFileName, self.fileName)
        except Exception:
            # do not leave the temporary file behind on failure
            if os.path.exists(tmpFileName):
                os.remove(tmpFileName)
            raise
        self.headerWritten = True


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        @return: a list of format names
        """
        return ["sam"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        @return: a string
        """
        return "sam"


    def printTranscript(self, transcript):
        """
        Export the given transcript with SAM format
        Also records the largest end position seen on each chromosome, which
        is used as the chromosome length in the header.
        @param transcript: transcript to be printed
        @type transcript: class L{Transcript<Transcript>}
        @return: a string
        """
        chromosome = transcript.getChromosome()
        self.sizes[chromosome] = max(transcript.getEnd(), self.sizes.get(chromosome, 0))
        return transcript.printSam()
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/SequenceListWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/SequenceListWriter.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,94 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#

class SequenceListWriter(object):
    """
    An interface that writes a list of sequences into a file
    Subclasses must provide C{getLine(sequence)}, which returns the text
    written for one sequence.
    @ivar fileName: name of the file
    @type fileName: string
    @ivar handle: handle to the file (None once the writer is closed)
    @type handle: file handle
    @ivar verbosity: verbosity
    @type verbosity: int
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor (opens the file for writing)
        @param fileName: name of the file
        @type fileName: string
        @param verbosity: verbosity
        @type verbosity: int
        """
        self.fileName = fileName
        self.verbosity = verbosity
        self.handle = open(self.fileName, "w")


    def __del__(self):
        """
        Destructor
        """
        self.close()


    def write(self):
        """
        No-op
        """
        pass


    def close(self):
        """
        Close writer
        Safe to call several times, and on an object whose constructor failed
        before the file was opened (the destructor calls it in that case too).
        """
        # getattr: "handle" does not exist when open() raised in __init__
        handle = getattr(self, "handle", None)
        if handle is not None and not handle.closed:
            handle.close()
        self.handle = None


    def addSequence(self, sequence):
        """
        Add a sequence to the list of sequence to be written
        @param sequence: sequence to be written
        @type sequence: class L{Sequence<Sequence>}
        """
        self.handle.write(self.getLine(sequence))


    def addElement(self, element):
        """
        Same as "addSequence"
        @param element: sequence to be written
        @type element: class L{Sequence<Sequence>}
        """
        self.addSequence(element)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/TranscriptListWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/TranscriptListWriter.py Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,163 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from SMART.Java.Python.misc.Progress import Progress

class TranscriptListWriter(object):
    """
    An interface that writes a transcript list into a file.
    Each transcript is formatted with self.printTranscript(transcript), which
    is not defined in this class and must be supplied by the concrete writer.
    @ivar fileName:         name of the file
    @type fileName:         string
    @ivar handle:           handle to the file (None once the writer is closed)
    @type handle:           file handle
    @ivar header:           first lines of the file
    @type header:           string
    @ivar started:          whether some transcripts have already been written
    @type started:          boolean
    @ivar lastChromosome:   the chromosome of the transcript which was inserted last
    @type lastChromosome:   string
    @ivar sequenceFileName: name of the sequence file given to addSequenceFile
    @type sequenceFileName: string
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor: opens the output file for writing
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.fileName         = fileName
        self.verbosity        = verbosity
        # Defined before open() so that close()/__del__ stay safe if open() raises.
        self.handle           = None
        self.started          = False
        self.lastChromosome   = None
        self.header           = ""
        self.sequenceFileName = None
        self.handle           = open(self.fileName, "w")


    def __del__(self):
        """
        Destructor: closes the file if it is still open
        """
        self.close()


    def close(self):
        """
        Close writer.
        Safe to call several times, and on an object whose constructor failed
        or never set a handle.
        """
        handle = getattr(self, "handle", None)
        if handle is not None and not handle.closed:
            handle.close()
        self.handle = None


    def addTranscript(self, transcript):
        """
        Add a transcript to the list of transcripts to be written.
        The header is written once, just before the first transcript.
        @param transcript: transcript to be written
        @type  transcript: class L{Transcript<Transcript>}
        """
        if not self.started:
            self.handle.write(self.header)
            self.started = True

        self.handle.write(self.printTranscript(transcript))
        self.lastChromosome = transcript.getChromosome()


    def addElement(self, element):
        """
        Same as "addTranscript"
        @param element: transcript to be written
        @type  element: class L{Transcript<Transcript>}
        """
        self.addTranscript(element)


    def addTranscriptList(self, transcriptList):
        """
        Add a list of transcripts to the transcripts to be written
        @param transcriptList: transcripts to be written
        @type  transcriptList: class L{TranscriptList<TranscriptList>}
        """
        progress = Progress(transcriptList.getNbTranscripts(), "Writing transcripts", self.verbosity)
        for transcript in transcriptList.getIterator():
            self.addTranscript(transcript)
            progress.inc()
        progress.done()


    def addTranscriptTable(self, transcriptTable):
        """
        Add a list of transcripts in a mySQL table to the transcripts to be written
        @param transcriptTable: transcripts to be written
        @type  transcriptTable: class L{MySqlTranscriptTable<MySqlTranscriptTable>}
        """
        for transcript in transcriptTable.getIterator():
            self.addTranscript(transcript)


    def setTitle(self, title):
        """
        Possibly write a title for the list (by default, do nothing)
        @param title: a title for the list
        @type  title: string
        """
        pass


    def setFeature(self, feature):
        """
        Set the name of the feature (by default, do nothing)
        @param feature: the name of the feature
        @type  feature: string
        """
        pass


    def setFeaturePart(self, featurePart):
        """
        Set the name of the feature part (by default, do nothing)
        @param featurePart: the name of the feature part
        @type  featurePart: string
        """
        pass


    def setStrands(self, strands):
        """
        Possibly consider both strands separately (by default, do nothing).
        Provided so that every transcript writer accepts this call, like the
        other optional setters above.
        @param strands: whether both strands should be considered separately
        @type  strands: boolean
        """
        pass


    def addSequenceFile(self, fileName):
        """
        Record the name of the multi-fasta file of the sequences
        @param fileName: name of the sequence file
        @type  fileName: string
        """
        self.sequenceFileName = fileName


    def write(self):
        """
        No-op
        """
        pass
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/TranscriptWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/TranscriptWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,189 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os
import sys
from commons.core.writer.WriterChooser import WriterChooser
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter

class TranscriptWriter(object):
    """
    An interface class that writes a list of transcripts, handle different formats
    @ivar container: container of the data
    @type container: L{TranscriptContainer<TranscriptContainer>}
    @ivar format:    format of the data to be printed
    @type format:    string
    @ivar file:      the file where to print
    @type file:      string
    @ivar type:      type of the data ("sql", or the type reported by the writer chooser)
    @type type:      string
    @ivar writer:    a transcript list writer
    @type writer:    L{TranscriptListWriter<TranscriptListWriter>} or None
    @ivar mode:      use a container ("container") or enter transcript one by one ("transcript")
    @type mode:      string
    @ivar verbosity: verbosity
    @type verbosity: int
    """

    def __init__(self, file, format, verbosity = 0):
        """
        Constructor: selects and creates the underlying writer
        @param file:      file where to print
        @type  file:      string
        @param format:    format of the data
        @type  format:    string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.container = None
        self.format    = format
        self.file      = file

        self.verbosity = verbosity
        self.type      = None
        self.writer    = None
        self.mode      = None
        if self.format is None:
            sys.exit("Error! Writer input format is empty!")

        if self.format == "sql":
            self.type = "sql"
            # Only the part after the last path separator is passed to the SQL writer.
            pos = self.file.rfind(os.sep)
            if pos > -1:
                self.file = self.file[pos+1:]
            self.writer = MySqlTranscriptWriter(self.file, self.verbosity)
        else:
            writerChooser = WriterChooser(self.verbosity)
            writerChooser.findFormat(self.format)
            self.writer = writerChooser.getWriter(self.file)
            self.type   = writerChooser.getType()


    def close(self):
        """
        Close writer
        """
        if self.writer is not None:
            self.writer.close()


    def setContainer(self, container):
        """
        Set a container for the data
        @param container: container of the data
        @type  container: class L{TranscriptContainer<TranscriptContainer>}
        @raise Exception: if transcripts were already added one by one
        """
        # Check first, so that a rejected call leaves the object untouched.
        if self.mode == "transcript":
            raise Exception("Error! TranscriptWriter '%s' on 'transcript' mode is currently used on 'container' mode." % (self.file))
        self.container = container
        self.mode      = "container"


    def addTranscript(self, transcript):
        """
        Add a transcript to write
        @param transcript: a transcript
        @type  transcript: class L{Transcript<Transcript>}
        """
        # Check the mode before touching the writer: a conflicting call must not
        # emit the transcript and then abort.
        if self.mode == "container":
            sys.exit("Error! TranscriptWriter '%s' on 'container' mode is currently used on 'transcript' mode." % (self.file))
        self.writer.addTranscript(transcript)
        self.mode = "transcript"


    def addElement(self, transcript):
        """
        Same as addTranscript
        """
        self.addTranscript(transcript)


    def setTitle(self, title):
        """
        Possibly write a title for the list
        @param title: a title for the list
        @type  title: string
        """
        if self.writer is not None:
            self.writer.setTitle(title)


    def setFeature(self, feature):
        """
        Possibly set the name of the feature
        @param feature: the name of the feature
        @type  feature: string
        """
        if self.writer is not None:
            self.writer.setFeature(feature)


    def setFeaturePart(self, featurePart):
        """
        Possibly set the name of the feature part
        @param featurePart: the name of the feature part
        @type  featurePart: string
        """
        if self.writer is not None:
            self.writer.setFeaturePart(featurePart)


    def setStrands(self, strands):
        """
        Possibly consider both strands separately
        @param strands: whether both strands should be considered separately
        @type  strands: boolean
        """
        # Not every writer defines setStrands; writers without it are left alone.
        setter = getattr(self.writer, "setStrands", None)
        if setter is not None:
            setter(strands)


    def write(self):
        """
        Write the content and possibly convert data
        """
        if self.type == "transcript" or self.type == "sequence":
            if self.mode == "container":
                self.writer.addTranscriptList(self.container)
                return

            # Transcripts were added one by one, or nothing was added at all:
            # there is no container to read from, so only let the writer finish.
            self.writer.write()
            return

        # NOTE(review): this branch reads self.container, so it presumably
        # expects setContainer to have been called first -- confirm with callers.
        if self.container.format != "sql":
            self.container.storeIntoDatabase()
        tables = self.container.getTables()
        for chromosome in tables:
            tables[chromosome].rename("%s_%s" % (self.file, chromosome))
        return


    def addSequenceFile(self, fileName):
        """
        Give the name of a sequence file to the underlying writer
        @param fileName: name of the sequence file
        @type  fileName: string
        """
        self.writer.addSequenceFile(fileName)
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/UcscWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/UcscWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,73 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.BedWriter import BedWriter

class UcscWriter(BedWriter):
    """
    Writer that stores a transcript list in a file using the UCSC flavour of
    the BED format (minor differences with BED format)
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Build the writer; everything is delegated to the BED writer
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        super(UcscWriter, self).__init__(fileName, verbosity)


    @staticmethod
    def getFileFormats():
        """
        Names of the formats handled by this writer
        @return: a list of strings
        """
        formats = ["ucsc"]
        return formats


    @staticmethod
    def getExtension():
        """
        Usual extension of the files produced by this writer
        @return: a string
        """
        extension = "bed"
        return extension


    def printTranscript(self, transcript):
        """
        Format the given transcript, using its printUcsc method
        @param transcript: transcript to be printed
        @type  transcript: class L{Transcript<Transcript>}
        @return:           a string
        """
        ucscLine = transcript.printUcsc()
        return ucscLine
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/WigWriter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/WigWriter.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,139 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from commons.core.writer.TranscriptListWriter import TranscriptListWriter


class WigWriter(TranscriptListWriter):
    """
    A class that writes a transcript list into a file with WIGGLE format.
    Coverage is accumulated in memory and the file is produced by close()
    (also called by the destructor).
    @ivar fileName:  name of the file
    @type fileName:  string
    @ivar handle:    unused by this writer, always None
    @type handle:    None
    @ivar data:      coverage counts, indexed by strand, chromosome and position
    @type data:      dict of dict of dict of int
    @ivar title:     name of the track
    @type title:     string
    @ivar strands:   whether each strand is counted separately
    @type strands:   boolean
    @ivar dirty:     whether the file on disk is missing or out of date
    @type dirty:     boolean
    """


    def __init__(self, fileName, verbosity = 0):
        """
        Constructor (the file is not opened here)
        @param fileName:  name of the file
        @type  fileName:  string
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.fileName  = fileName
        self.verbosity = verbosity
        self.data      = {-1: {}, 0: {}, 1: {}}
        self.title     = "Reads"
        self.strands   = False
        self.handle    = None
        # Starts True so that a track without any data is still written once.
        self.dirty     = True


    def __del__(self):
        """
        Destructor
        Print the file if close() has not already done so
        """
        self.close()


    def close(self):
        """
        Actually print the file.
        Does nothing when the file on disk already reflects the current data,
        so calling it several times (or before the destructor) is harmless.
        """
        if not getattr(self, "dirty", False):
            return
        self._printFile()
        self.dirty = False


    def _printFile(self):
        """
        Write the track line, then one variableStep section per strand and chromosome
        """
        strand2string = {-1: "-", 1: "+", 0: ""}
        handle = open(self.fileName, "w")
        try:
            handle.write("track type=wiggle_0 name=\"%s\"\n" % (self.title))
            for strand in self.data:
                for chromosome in sorted(self.data[strand]):
                    counts = self.data[strand][chromosome]
                    handle.write("variableStep chrom=%s%s\n" % (chromosome, strand2string[strand]))
                    for pos in sorted(counts):
                        handle.write("%d\t%d\n" % (pos, counts[pos]))
        finally:
            # Release the file even if a write fails.
            handle.close()


    @staticmethod
    def getFileFormats():
        """
        Get the format of the file
        """
        return ["wig", "wiggle"]


    @staticmethod
    def getExtension():
        """
        Get the usual extension for the file
        """
        return "wig"


    def setTitle(self, title):
        """
        Set the title of the track (None is ignored)
        @param title: the title of the track
        @type  title: string
        """
        if title is not None:
            self.title = title
            self.dirty = True


    def setStrands(self, strands):
        """
        Consider each strand separately
        @param strands: whether each strand should be considered separately
        @type  strands: boolean
        """
        self.strands = strands


    def copyProperties(self, parser):
        """
        Copy the properties collected by a parser, to produce a similar output
        (only the title is copied)
        @param parser: an object with a "title" attribute
        @type  parser: object
        """
        self.setTitle(parser.title)


    def addTranscript(self, transcript):
        """
        Add one to the count of every position covered by an exon of the
        given transcript (exon start and end both included)
        @param transcript: transcript to be counted
        @type  transcript: class L{Transcript<Transcript>}
        """
        chromosome = transcript.getChromosome()
        direction  = transcript.getDirection()
        if not self.strands:
            direction = 0
        counts = self.data[direction].setdefault(chromosome, {})
        for exon in transcript.getExons():
            for pos in range(exon.getStart(), exon.getEnd()+1):
                counts[pos] = counts.get(pos, 0) + 1
        self.dirty = True
b |
diff -r d94018ca4ada -r 44d5973c188c commons/core/writer/WriterChooser.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/writer/WriterChooser.py Tue Apr 30 15:02:29 2013 -0400 |
[ |
@@ -0,0 +1,127 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import sys
from commons.core.writer.TranscriptListWriter import TranscriptListWriter
from commons.core.writer.SequenceListWriter import SequenceListWriter
from commons.core.writer.BedWriter import BedWriter
from commons.core.writer.CsvWriter import CsvWriter
from commons.core.writer.EmblWriter import EmblWriter
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from commons.core.writer.GbWriter import GbWriter
from commons.core.writer.Gff2Writer import Gff2Writer
from commons.core.writer.SamWriter import SamWriter
from commons.core.writer.UcscWriter import UcscWriter
from commons.core.writer.WigWriter import WigWriter
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.GtfWriter import GtfWriter
from commons.core.writer.MapWriter import MapWriter


class WriterChooser(object):
    """
    A class that finds the correct writer
    @ivar type:        transcript / sequence writer
    @type type:        string
    @ivar format:      the format of the writer
    @type format:      string
    @ivar writerClass: the class of the writer
    @type writerClass: class
    @ivar extension:   default extension of the file
    @type extension:   string
    @ivar verbosity:   verbosity
    @type verbosity:   int
    """

    def __init__(self, verbosity = 0):
        """
        Constructor
        @param verbosity: verbosity
        @type  verbosity: int
        """
        self.type        = None
        self.format      = None
        self.writerClass = None
        self.extension   = None
        self.verbosity   = verbosity


    def findFormat(self, format, type = None):
        """
        Find the correct writer class and remember it, together with its
        default extension and its type. Exits if no writer handles the format.
        @param format: the format
        @type  format: string
        @param type:   "transcript", "sequence", or None for both
        @type  type:   string
        """
        classes = {}
        if (type == "transcript"):
            classes = {TranscriptListWriter: "transcript"}
        elif (type == "sequence"):
            classes = {SequenceListWriter: "sequence"}
        elif (type is None):
            classes = {TranscriptListWriter: "transcript", SequenceListWriter: "sequence"}
        else:
            sys.exit("Do not understand format type '%s'" % (type))

        for classType in classes:
            # __subclasses__() only lists immediate subclasses, so writers that
            # derive from another writer would be missed. Walk the whole
            # hierarchy breadth-first: direct subclasses are still tried first.
            candidates = list(classType.__subclasses__())
            visited    = set()
            while candidates:
                writerClass = candidates.pop(0)
                if writerClass in visited:
                    continue
                visited.add(writerClass)
                if format in writerClass.getFileFormats():
                    self.writerClass = writerClass
                    self.extension   = writerClass.getExtension()
                    self.type        = classes[classType]
                    return
                candidates.extend(writerClass.__subclasses__())
        sys.exit("Cannot get writer for format '%s'" % (format))


    def getWriter(self, fileName):
        """
        Get the writer previously found
        @param fileName: name of the file to write
        @type  fileName: string
        @return:         the writer
        """
        return self.writerClass(fileName, self.verbosity)


    def getType(self):
        """
        Get the type of writer previously found
        @return: the type of writer
        """
        return self.type


    def getExtension(self):
        """
        Get the default extension of writer previously found
        @return: the extension
        """
        return self.extension
b |
diff -r d94018ca4ada -r 44d5973c188c doc.pdf |
b |
Binary file doc.pdf has changed |
b |
diff -r d94018ca4ada -r 44d5973c188c tool_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_conf.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,48 @@ + <section id="s_mart" name="S-MART" version=""> + <label id="Smart_Comparison" text="Comparison Tools" version=""/> + <tool file="s_mart/CompareOverlappingSmallQuery.xml"/> + <tool file="s_mart/CompareOverlappingSmallRef.xml"/> + <tool file="s_mart/compareOverlapping.xml"/> + <tool file="s_mart/getDifference.xml"/> + <tool file="s_mart/computeCoverage.xml"/> + <tool file="s_mart/GetFlanking.xml"/> + <tool file="s_mart/GetDifferentialExpression.xml"/> + <label id="Smart_Merge" text="Merge Tools" version=""/> + <tool file="s_mart/clusterize.xml"/> + <tool file="s_mart/mergeTranscriptLists.xml"/> + <tool file="s_mart/CollapseReads.xml"/> + <tool file="s_mart/clusterizeBySlidingWindows.xml"/> + <tool file="s_mart/mergeSlidingWindowsClusters.xml"/> + <label id="Smart_Visualization" text="Visualization Tools" version=""/> + <tool file="s_mart/getDistribution.xml"/> + <tool file="s_mart/getDistance.xml"/> + <tool file="s_mart/getSizes.xml"/> + <tool file="s_mart/plotCoverage.xml"/> + <tool file="s_mart/WrappGetLetterDistribution1.xml"/> + <tool file="s_mart/plotTranscriptList.xml"/> + <label id="Smart_Sequence" text="Sequence Tools" version=""/> + <tool file="s_mart/CountReadGCPercent.xml"/> + <label id="Smart_Modification" text="Modification Tools" version=""/> + <tool file="s_mart/modifyGenomicCoordinates.xml"/> + <tool file="s_mart/modifySequenceList.xml"/> + <tool file="s_mart/trimSequences.xml"/> + <label id="Smart_Selection" text="Selection Tools" version=""/> + <tool file="s_mart/getExons.xml"/> + <tool file="s_mart/getIntrons.xml"/> + <tool file="s_mart/restrictFromSize.xml"/> + <tool file="s_mart/restrictTranscriptList.xml"/> + <label id="Smart_Conversion" text="Conversion Tools" version=""/> + <tool file="s_mart/ConvertTranscriptFile.xml"/> + <tool file="s_mart/coordinatesToSequence.xml"/> + <tool file="s_mart/mapperAnalyzer.xml"/> + <label id="Smart_WIG" text="WIG Manipulation Tools" version=""/> + <tool file="s_mart/getWigData.xml"/> + <tool 
file="s_mart/getWigDistance.xml"/> + <tool file="s_mart/getWigProfile.xml"/> + <label id="Smart_GFF" text="GFF Manipulation Tools" version=""/> + <tool file="s_mart/CleanTranscriptFile.xml"/> + <tool file="s_mart/changeTagName.xml"/> + <tool file="s_mart/changeGffFeatures.xml"/> + <tool file="s_mart/removeExonLines.xml"/> + <tool file="s_mart/SelectByTag.xml"/> + </section> |
b |
diff -r d94018ca4ada -r 44d5973c188c tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Apr 30 15:02:29 2013 -0400 |
b |
@@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <set_environment version="1.0"> + <environment_variable name="PYTHONPATH" action="set_to">$REPOSITORY_INSTALL_DIR</environment_variable> + </set_environment> +</tool_dependency> |