| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 """Merge elements of two transcript lists with some condition""" | 
|  | 32 | 
|  | 33 import os, random, shutil, glob | 
|  | 34 from optparse import OptionParser | 
|  | 35 from commons.core.parsing.SequenceListParser import SequenceListParser | 
|  | 36 from commons.core.parsing.BedParser import BedParser | 
|  | 37 from commons.core.parsing.GffParser import GffParser | 
|  | 38 from commons.core.writer.TranscriptWriter import TranscriptWriter | 
|  | 39 from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator | 
|  | 40 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | 
|  | 41 from SMART.Java.Python.misc.RPlotter import RPlotter | 
|  | 42 from SMART.Java.Python.misc.Progress import Progress | 
|  | 43 | 
|  | 44 | 
|  | 45 | 
class MergeLists(object):
    """Merge the elements of two transcript lists into a single GFF3 file.

    Typical use: create the object, give the two input files with
    setInputFileName() (ids 0 and 1), give the output file with
    setOutputFileName(), tune the comparison with the other setters, then
    call run().

    Intermediate results are written to temporary GFF3 files in the current
    directory (see createTmpFileName()).
    """

    def __init__(self, verbosity):
        """Create a merger with every option switched off.

        Args:
            verbosity: trace level, forwarded to the comparators, containers
                and writers created by this object.
        """
        self.verbosity      = verbosity
        # Random tag embedded in the temporary file names.
        self.seed           = random.randint(0, 100000)
        self.aggregation    = False
        self.normalization  = False
        self.distance       = False
        self.antisense      = False
        self.colinear       = False
        # File names and formats of the inputs, indexed by input id (0 or 1).
        self.fileNames      = {}
        self.formats        = {}
        # Defined up front so that a missing setOutputFileName() call can be
        # detected in run() instead of raising an AttributeError.
        self.outputFileName = None
        # Names of the temporary files created so far.
        self.tmpFileNames   = []
        self.logHandle      = None

    # NOTE: automatic clean-up is disabled (code kept below for reference):
    # the temporary files listed in self.tmpFileNames stay on disk and the
    # log handle, if any, is not closed by this class.
#    def __del__(self):
#        for fileNameRoot in self.tmpFileNames:
#            for fileName in glob.glob("%s*" % (fileNameRoot)):
#                os.remove(fileName)
#        if self.logHandle != None:
#            self.logHandle.close()
#            self.logHandle = None

    def setLogFileName(self, fileName):
        """Open fileName for writing and use it as log handle."""
        self.logHandle = open(fileName, "w")

    def setInputFileName(self, fileName, format, id):
        """Register an input file.

        Args:
            fileName: path of the input file.
            format: transcript format of the file.
            id: input index; run() reads the inputs 0 and 1.
        """
        self.fileNames[id] = fileName
        self.formats[id]   = format

    def setOutputFileName(self, fileName):
        """Set the path of the GFF3 file written by run()."""
        self.outputFileName = fileName

    def setAggregate(self, aggregation):
        """Set the aggregation flag (forwarded to the comparator; when set,
        run() skips its overlap-selection step)."""
        self.aggregation = aggregation

    def setNormalization(self, normalization):
        """Set the normalization flag forwarded to the comparators."""
        self.normalization = normalization

    def setDistance(self, distance):
        """Set the maximum distance forwarded to the comparator."""
        self.distance = distance

    def setAntisense(self, antisense):
        """Set the "antisense only" flag forwarded to the comparator."""
        self.antisense = antisense

    def setColinear(self, colinear):
        """Set the "colinear only" flag forwarded to the comparator."""
        self.colinear = colinear

    def createTmpFileName(self, root):
        """Build, record and return a temporary GFF3 file name.

        Args:
            root: label inserted in the name, before the random tag.

        Returns:
            the file name, "tmp_<root>_<seed>.gff3" (relative path).
        """
        fileName = "tmp_%s_%d.gff3" % (root, self.seed)
        self.tmpFileNames.append(fileName)
        return fileName

    def selfMerge(self, fileName, format, outputFileName):
        """Self-merge one transcript file and write the result as GFF3.

        Args:
            fileName: path of the file to merge.
            format: transcript format of that file.
            outputFileName: path of the GFF3 file to write.
        """
        transcriptListComparator = TranscriptListsComparator(self.logHandle, self.verbosity)
        # The self-merge always runs with the "colinear only" switch on,
        # whatever the value of self.colinear.
        transcriptListComparator.getColinearOnly(True)
        transcriptListComparator.setNormalization(self.normalization)
        transcriptContainer = TranscriptContainer(fileName, format, self.verbosity)
        writer              = TranscriptWriter(outputFileName, "gff3", self.verbosity)
        transcriptListComparator.setInputTranscriptContainer(transcriptListComparator.QUERY, transcriptContainer)
        transcriptListComparator.setOutputWriter(writer)
        transcriptListComparator.compareTranscriptListSelfMerge()

    def keepOverlapping(self, fileNames, formats, outputFileName):
        """Compare two transcript files and write the result as GFF3.

        The comparison uses the antisense, colinear, aggregation,
        normalization and distance settings of this object.

        Args:
            fileNames: paths of the two files, indexed by 0 and 1.
            formats: formats of the two files, indexed by 0 and 1.
            outputFileName: path of the GFF3 file to write.
        """
        transcriptListComparator = TranscriptListsComparator(self.logHandle, self.verbosity)
        transcriptListComparator.getAntisenseOnly(self.antisense)
        transcriptListComparator.getColinearOnly(self.colinear)
        for i in (0, 1):
            transcriptContainer = TranscriptContainer(fileNames[i], formats[i], self.verbosity)
            transcriptListComparator.setInputTranscriptContainer(i, transcriptContainer)
        transcriptListComparator.aggregate(self.aggregation)
        transcriptListComparator.setNormalization(self.normalization)
        transcriptListComparator.setMaxDistance(self.distance)
        writer = TranscriptWriter(outputFileName, "gff3", self.verbosity)
        transcriptListComparator.setOutputWriter(writer)
        transcriptListComparator.compareTranscriptList()

    def mergeFiles(self, fileName1, fileName2, outputFileName):
        """Concatenate two text files into outputFileName.

        Args:
            fileName1: path of the file copied first.
            fileName2: path of the file copied second.
            outputFileName: path of the file to (over)write.
        """
        # Context managers close every handle, even when a copy fails.
        with open(outputFileName, "w") as outputFile:
            for fileName in (fileName1, fileName2):
                with open(fileName, "r") as inputFile:
                    shutil.copyfileobj(inputFile, outputFile)

    def run(self):
        """Merge the two registered input files into the output file.

        Steps:
          1. input 0 is compared with itself (keepOverlapping) into a
             temporary "query" file;
          2. input 1 is self-merged into a temporary "target" file;
          3. unless aggregation is on, the target file is replaced by its
             comparison with the query file;
          4. target and query files are concatenated and self-merged into
             the output file.

        Raises:
            ValueError: if no output file name was set.
            KeyError: if input 0 or input 1 was not registered.
        """
        # Fail before any expensive work or temporary file is produced.
        if self.outputFileName is None:
            raise ValueError("no output file name given: call setOutputFileName() first")
        # Read input 0 with the format declared for it; "gff3" is kept as
        # fall-back for callers which do not provide a format.
        queryFormat = self.formats.get(0) or "gff3"
        selectedFileQuery = self.createTmpFileName("query")
        self.keepOverlapping({0: self.fileNames[0], 1: self.fileNames[0]}, {0: queryFormat, 1: queryFormat}, selectedFileQuery)
        mergeFileTarget = self.createTmpFileName("target")
        self.selfMerge(self.fileNames[1], self.formats[1], mergeFileTarget)
        if not self.aggregation:
            # Both operands are temporary files produced above, hence GFF3.
            overlapFile = self.createTmpFileName("overlap")
            self.keepOverlapping({0: mergeFileTarget, 1: selectedFileQuery}, {0: "gff3", 1: "gff3"}, overlapFile)
            mergeFileTarget = overlapFile
        mergeFileMerged = self.createTmpFileName("merged")
        self.mergeFiles(mergeFileTarget, selectedFileQuery, mergeFileMerged)
        self.selfMerge(mergeFileMerged, "gff3", self.outputFileName)
|  | 142 | 
|  | 143 | 
def createOptionParser():
    """Build the command-line parser of the Merge Lists script.

    Returns:
        an OptionParser declaring the input/output files, their formats and
        the comparison switches.
    """
    description = "Merge Lists v1.0.3: Merge the elements of two lists of genomic coordinates. [Category: Merge]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1",    dest="inputFileName1", action="store",                       type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1",   dest="format1",        action="store",                       type="string", help="format of file 1 [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2",    dest="inputFileName2", action="store",      default=None,    type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2",   dest="format2",        action="store",      default=None,    type="string", help="format of file 2 [compulsory] [format: file in transcript format]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",      default=None,    type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-k", "--all",       dest="all",            action="store_true", default=False,                  help="print all the transcripts, not only those overlapping [format: bool] [default: false]")
    parser.add_option("-d", "--distance",  dest="distance",       action="store",      default=0,       type="int",    help="max. distance between two transcripts [format: int] [default: 0]")
    parser.add_option("-a", "--antisense", dest="antisense",      action="store_true", default=False,                  help="antisense only [format: bool] [default: false]")
    parser.add_option("-c", "--colinear",  dest="colinear",       action="store_true", default=False,                  help="colinear only [format: bool] [default: false]")
    parser.add_option("-n", "--normalize", dest="normalize",      action="store_true", default=False,                  help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,       type="int",    help="trace level [format: int]")
    return parser


def main(argv = None):
    """Parse the command line and run the merge.

    Args:
        argv: list of command-line arguments, without the program name;
            None means "use sys.argv[1:]".

    Exits through parser.error() (status 2) when a compulsory option is
    missing.
    """
    parser = createOptionParser()
    (options, args) = parser.parse_args(argv)

    # Report missing compulsory options up front rather than failing later
    # on a None file name. -f is not enforced, to stay compatible with
    # existing invocations which omit it.
    compulsory = (
        ("-i/--input1",  options.inputFileName1),
        ("-j/--input2",  options.inputFileName2),
        ("-g/--format2", options.format2),
        ("-o/--output",  options.outputFileName),
    )
    missing = [flag for (flag, value) in compulsory if value is None]
    if missing:
        parser.error("missing compulsory option(s): %s" % (", ".join(missing)))

    # Honour the trace level requested with -v.
    ml = MergeLists(options.verbosity)
    ml.setInputFileName(options.inputFileName1, options.format1, 0)
    ml.setInputFileName(options.inputFileName2, options.format2, 1)
    ml.setOutputFileName(options.outputFileName)
    ml.setAntisense(options.antisense)
    ml.setColinear(options.colinear)
    ml.setAggregate(options.all)
    ml.setNormalization(options.normalize)
    ml.setDistance(options.distance)
    ml.run()


if __name__ == "__main__":
    main()