comparison SMART/Java/Python/clusterize.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
29 # knowledge of the CeCILL license and that you accept its terms. 29 # knowledge of the CeCILL license and that you accept its terms.
30 # 30 #
31 from commons.core.writer.WriterChooser import WriterChooser 31 from commons.core.writer.WriterChooser import WriterChooser
32 """Clusterize a set of transcripts""" 32 """Clusterize a set of transcripts"""
33 33
34 import os 34 import os, os.path, random
35 from optparse import OptionParser 35 from optparse import OptionParser
36 from commons.core.parsing.ParserChooser import ParserChooser 36 from commons.core.parsing.ParserChooser import ParserChooser
37 from commons.core.writer.Gff3Writer import Gff3Writer 37 from commons.core.writer.Gff3Writer import Gff3Writer
38 from SMART.Java.Python.structure.Transcript import Transcript 38 from SMART.Java.Python.structure.Transcript import Transcript
39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle 39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
40 from SMART.Java.Python.ncList.FileSorter import FileSorter 40 from SMART.Java.Python.ncList.FileSorter import FileSorter
41 from SMART.Java.Python.misc.Progress import Progress 41 from SMART.Java.Python.misc.Progress import Progress
42 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
42 43
43 class Clusterize(object): 44 class Clusterize(object):
44
45 def __init__(self, verbosity):
46 self.normalize = False
47 self.presorted = False
48 self.distance = 1
49 self.colinear = False
50 self.nbWritten = 0
51 self.nbMerges = 0
52 self.verbosity = verbosity
53 self.splittedFileNames = {}
54 45
55 def __del__(self): 46 def __init__(self, verbosity):
56 for fileName in self.splittedFileNames.values(): 47 self.normalize = False
57 os.remove(fileName) 48 self.presorted = False
49 self.distance = 1
50 self.colinear = False
51 self.nbWritten = 0
52 self.nbMerges = 0
53 self.verbosity = verbosity
54 self.splittedFileNames = {}
58 55
59 def setInputFile(self, fileName, format): 56 def __del__(self):
60 parserChooser = ParserChooser(self.verbosity) 57 for fileName in self.splittedFileNames.values():
61 parserChooser.findFormat(format) 58 os.remove(fileName)
62 self.parser = parserChooser.getParser(fileName)
63 self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
64 59
65 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): 60 def setInputFile(self, fileName, format):
66 writerChooser = WriterChooser() 61 parserChooser = ParserChooser(self.verbosity)
67 writerChooser.findFormat(format) 62 parserChooser.findFormat(format)
68 self.writer = writerChooser.getWriter(fileName) 63 self.parser = parserChooser.getParser(fileName)
69 self.writer.setTitle(title) 64 self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
70 self.writer.setFeature(feature) 65 if "SMARTTMPPATH" in os.environ:
71 self.writer.setFeaturePart(featurePart) 66 self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName))
72 67
73 def setDistance(self, distance): 68 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
74 self.distance = distance 69 writerChooser = WriterChooser()
70 writerChooser.findFormat(format)
71 self.writer = writerChooser.getWriter(fileName)
72 self.writer.setTitle(title)
73 self.writer.setFeature(feature)
74 self.writer.setFeaturePart(featurePart)
75 75
76 def setColinear(self, colinear): 76 def setDistance(self, distance):
77 self.colinear = colinear 77 self.distance = distance
78 78
79 def setNormalize(self, normalize): 79 def setColinear(self, colinear):
80 self.normalize = normalize 80 self.colinear = colinear
81
82 def setPresorted(self, presorted):
83 self.presorted = presorted
84 81
85 def _sortFile(self): 82 def setNormalize(self, normalize):
86 fs = FileSorter(self.parser, self.verbosity-4) 83 self.normalize = normalize
87 fs.perChromosome(True) 84
88 fs.setPresorted(self.presorted) 85 def setPresorted(self, presorted):
89 fs.setOutputFileName(self.sortedFileName) 86 self.presorted = presorted
90 fs.sort()
91 self.splittedFileNames = fs.getOutputFileNames()
92 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
93 self.nbElements = fs.getNbElements()
94
95 def _iterate(self, chromosome):
96 progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
97 transcripts = []
98 parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
99 for newTranscript in parser.getIterator():
100 newTranscripts = []
101 for oldTranscript in transcripts:
102 if self._checkOverlap(newTranscript, oldTranscript):
103 self._merge(newTranscript, oldTranscript)
104 elif self._checkPassed(newTranscript, oldTranscript):
105 self._write(oldTranscript)
106 else:
107 newTranscripts.append(oldTranscript)
108 newTranscripts.append(newTranscript)
109 transcripts = newTranscripts
110 progress.inc()
111 for transcript in transcripts:
112 self._write(transcript)
113 progress.done()
114 87
115 def _merge(self, transcript1, transcript2): 88 def _sortFile(self):
116 self.nbMerges += 1 89 if self.presorted:
117 transcript2.setDirection(transcript1.getDirection()) 90 return
118 transcript1.merge(transcript2) 91 fs = FileSorter(self.parser, self.verbosity-4)
92 fs.perChromosome(True)
93 fs.setPresorted(self.presorted)
94 fs.setOutputFileName(self.sortedFileName)
95 fs.sort()
96 self.splittedFileNames = fs.getOutputFileNames()
97 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
98 self.nbElements = fs.getNbElements()
99
100 def _iterate(self, chromosome):
101 if chromosome == None:
102 progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
103 parser = self.parser
104 else:
105 progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
106 parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
107 transcripts = []
108 self.nbElements = 0
109 for newTranscript in parser.getIterator():
110 newTranscripts = []
111 if newTranscript.__class__.__name__ == "Mapping":
112 newTranscript = newTranscript.getTranscript()
113 for oldTranscript in transcripts:
114 if self._checkOverlap(newTranscript, oldTranscript):
115 self._merge(newTranscript, oldTranscript)
116 elif self._checkPassed(newTranscript, oldTranscript):
117 self._write(oldTranscript)
118 else:
119 newTranscripts.append(oldTranscript)
120 newTranscripts.append(newTranscript)
121 transcripts = newTranscripts
122 self.nbElements += 1
123 progress.inc()
124 for transcript in transcripts:
125 self._write(transcript)
126 progress.done()
119 127
120 def _write(self, transcript): 128 def _merge(self, transcript1, transcript2):
121 self.nbWritten += 1 129 self.nbMerges += 1
122 self.writer.addTranscript(transcript) 130 transcript2.setDirection(transcript1.getDirection())
131 transcript1.merge(transcript2)
123 132
124 def _checkOverlap(self, transcript1, transcript2): 133 def _write(self, transcript):
125 if self.colinear and transcript1.getDirection() != transcript2.getDirection(): 134 self.nbWritten += 1
126 return False 135 self.writer.addTranscript(transcript)
127 if transcript1.getDistance(transcript2) > self.distance:
128 return False
129 return True
130 136
131 def _checkPassed(self, transcript1, transcript2): 137 def _checkOverlap(self, transcript1, transcript2):
132 return (transcript1.getDistance(transcript2) > self.distance) 138 if transcript1.getChromosome() != transcript2.getChromosome():
139 return False
140 if self.colinear and transcript1.getDirection() != transcript2.getDirection():
141 return False
142 if transcript1.getDistance(transcript2) > self.distance:
143 return False
144 return True
133 145
134 def run(self): 146 def _checkPassed(self, transcript1, transcript2):
135 self._sortFile() 147 return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance))
136 for chromosome in sorted(self.splittedFileNames.keys()): 148
137 self._iterate(chromosome) 149 def run(self):
138 self.writer.close() 150 self._sortFile()
139 if self.verbosity > 0: 151 if self.presorted:
140 print "# input: %d" % (self.nbElements) 152 self._iterate(None)
141 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) 153 else:
142 print "# merges: %d" % (self.nbMerges) 154 for chromosome in sorted(self.splittedFileNames.keys()):
143 155 self._iterate(chromosome)
156 self.writer.close()
157 if self.verbosity > 0:
158 print "# input: %d" % (self.nbElements)
159 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100))
160 print "# merges: %d" % (self.nbMerges)
161
144 162
145 if __name__ == "__main__": 163 if __name__ == "__main__":
146 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" 164 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"
147 165
148 parser = OptionParser(description = description) 166 parser = OptionParser(description = description)
149 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") 167 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
150 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]") 168 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]")
151 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") 169 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
152 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]") 170 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]")
153 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]") 171 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]")
154 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]") 172 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]")
155 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") 173 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
156 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") 174 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]")
157 (options, args) = parser.parse_args() 175 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
158 176 (options, args) = parser.parse_args()
159 c = Clusterize(options.verbosity) 177
160 c.setInputFile(options.inputFileName, options.format) 178 c = Clusterize(options.verbosity)
161 c.setOutputFileName(options.outputFileName, options.outputFormat) 179 c.setInputFile(options.inputFileName, options.format)
162 c.setColinear(options.colinear) 180 c.setOutputFileName(options.outputFileName, options.outputFormat)
163 c.setDistance(options.distance) 181 c.setColinear(options.colinear)
164 c.setNormalize(options.normalize) 182 c.setDistance(options.distance)
165 c.run() 183 c.setNormalize(options.normalize)
184 c.setPresorted(options.sorted)
185 c.run()