comparison SMART/Java/Python/GetFlanking.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
42 TAG_DISTANCE = "distance_" 42 TAG_DISTANCE = "distance_"
43 TAG_SENSE = "_sense" 43 TAG_SENSE = "_sense"
44 TAG_REGION = "_region" 44 TAG_REGION = "_region"
45 TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"} 45 TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"}
46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"} 46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}
47 TAGS_SENSE = {-1: "antisense", 0: "", 1: "colinear"} 47 TAGS_SENSE = {-1: "antisense", 0: "", 1: "collinear"}
48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"} 48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}
49 49
50 50
51 def getOrderKey(transcript, direction): 51 def getOrderKey(transcript, direction, input):
52 if direction == 1: 52 if direction == 1:
53 return transcript.getEnd() 53 if input == QUERY:
54 return - transcript.getStart() 54 return (transcript.getEnd(), -transcript.getStart())
55 55 return (transcript.getStart(), -transcript.getEnd())
56 def isInGoodRegion(transcriptRef, transcriptQuery, direction): 56 if input == QUERY:
57 if direction == 1: 57 return (-transcript.getStart(), transcript.getEnd())
58 return transcriptQuery.getEnd() > transcriptRef.getEnd() 58 return (-transcript.getEnd(), transcript.getStart())
59 return transcriptQuery.getStart() < transcriptRef.getStart()
60 59
61 60
62 class GetFlanking(object): 61 class GetFlanking(object):
63 62
64 def __init__(self, verbosity): 63 def __init__(self, verbosity):
65 self.verbosity = verbosity 64 self.verbosity = verbosity
66 self.transcripts = dict([id, {}] for id in INPUTS) 65 self.transcripts = dict([id, {}] for id in INPUTS)
67 self.directions = [] 66 self.directions = []
68 self.noOverlap = False 67 self.noOverlap = False
69 self.colinear = False 68 self.colinear = False
70 self.antisense = False 69 self.antisense = False
71 self.distance = None 70 self.distance = None
72 self.minDistance = None 71 self.minDistance = None
73 self.maxDistance = None 72 self.maxDistance = None
74 self.tagName = "flanking" 73 self.tagName = "flanking"
75 74
76 def setInputFile(self, fileName, format, id): 75 def setInputFile(self, fileName, format, id):
77 chooser = ParserChooser(self.verbosity) 76 chooser = ParserChooser(self.verbosity)
78 chooser.findFormat(format) 77 chooser.findFormat(format)
79 parser = chooser.getParser(fileName) 78 parser = chooser.getParser(fileName)
80 for transcript in parser.getIterator(): 79 for transcript in parser.getIterator():
81 chromosome = transcript.getChromosome() 80 chromosome = transcript.getChromosome()
82 if chromosome not in self.transcripts[id]: 81 if chromosome not in self.transcripts[id]:
83 self.transcripts[id][chromosome] = [] 82 self.transcripts[id][chromosome] = []
84 self.transcripts[id][chromosome].append(transcript) 83 self.transcripts[id][chromosome].append(transcript)
85 84
86 def setOutputFile(self, fileName): 85 def setOutputFile(self, fileName):
87 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity) 86 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
88 87
89 def addUpstreamDirection(self, upstream): 88 def addUpstreamDirection(self, upstream):
90 if upstream: 89 if upstream:
91 self.directions.append(-1) 90 self.directions.append(-1)
92 91
93 def addDownstreamDirection(self, downstream): 92 def addDownstreamDirection(self, downstream):
94 if downstream: 93 if downstream:
95 self.directions.append(1) 94 self.directions.append(1)
96 95
97 def setColinear(self, colinear): 96 def setColinear(self, colinear):
98 self.colinear = colinear 97 self.colinear = colinear
99 98
100 def setAntisense(self, antisense): 99 def setAntisense(self, antisense):
101 self.antisense = antisense 100 self.antisense = antisense
102 101
103 def setNoOverlap(self, noOverlap): 102 def setNoOverlap(self, noOverlap):
104 self.noOverlap = noOverlap 103 self.noOverlap = noOverlap
105 104
106 def setMinDistance(self, distance): 105 def setMinDistance(self, distance):
107 self.minDistance = distance 106 self.minDistance = distance
108 107
109 def setMaxDistance(self, distance): 108 def setMaxDistance(self, distance):
110 self.maxDistance = distance 109 self.maxDistance = distance
111 110
112 def setNewTagName(self, tagName): 111 def setNewTagName(self, tagName):
113 self.tagName = tagName 112 self.tagName = tagName
114 113
115 def match(self, transcriptRef, transcriptQuery, direction): 114 def match(self, transcriptQuery, transcriptRef, direction):
116 if self.noOverlap and transcriptRef.overlapWith(transcriptQuery): 115 #print "comparing", transcriptQuery, "with", transcriptRef, "on direction", direction
117 return False 116 if direction == 1 and transcriptRef.getEnd() < transcriptQuery.getStart():
118 if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection(): 117 return False
119 return False 118 if direction == -1 and transcriptQuery.getEnd() < transcriptRef.getStart():
120 if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection(): 119 return False
121 return False 120 if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
122 if self.minDistance != None or self.maxDistance != None: 121 return False
123 distance = transcriptRef.getDistance(transcriptQuery) 122 if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection():
124 if self.minDistance != None and distance < self.minDistance: 123 return False
125 return False 124 if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection():
126 if self.maxDistance != None and distance > self.maxDistance: 125 return False
127 return False 126 if self.minDistance != None or self.maxDistance != None:
128 return True 127 distance = transcriptRef.getDistance(transcriptQuery)
129 128 if self.minDistance != None and distance < self.minDistance:
130 def getFlanking(self, direction): 129 return False
131 for chromosome in sorted(self.transcripts[REFERENCE].keys()): 130 if self.maxDistance != None and distance > self.maxDistance:
132 if chromosome not in self.transcripts[QUERY]: 131 return False
133 continue 132 return True
134 sortedTranscripts = dict([id, {}] for id in INPUTS) 133
135 for id in INPUTS: 134 def getFlanking(self, chromosome, direction):
136 sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction)) 135 if chromosome not in self.transcripts[REFERENCE]:
137 refIndex = 0 136 return
138 currentRefs = [] 137 sortedTranscripts = dict([id, {}] for id in INPUTS)
139 outputs = set() 138 for id in INPUTS:
140 progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity) 139 sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction, id))
141 for query in sortedTranscripts[QUERY]: 140 refIndex = 0
142 while refIndex < len(sortedTranscripts[REFERENCE]) and isInGoodRegion(sortedTranscripts[REFERENCE][refIndex], query, direction): 141 progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
143 currentRefs.append(sortedTranscripts[REFERENCE][refIndex]) 142 for query in sortedTranscripts[QUERY]:
144 refIndex += 1 143 #print "Q: ", query
145 nextCurrentRefs = [] 144 #print "R1: ", sortedTranscripts[REFERENCE][refIndex]
146 for currentRef in currentRefs: 145 while not self.match(query, sortedTranscripts[REFERENCE][refIndex], direction):
147 if self.match(currentRef, query, direction): 146 refIndex += 1
148 if currentRef not in self.flankings: 147 if refIndex == len(sortedTranscripts[REFERENCE]):
149 self.flankings[currentRef] = {} 148 progress.done()
150 self.flankings[currentRef][direction * currentRef.getDirection()] = query 149 #print "done"
151 else: 150 return
152 nextCurrentRefs.append(currentRef) 151 #print "R2: ", sortedTranscripts[REFERENCE][refIndex]
153 currentRefs = nextCurrentRefs 152 self.flankings[query][direction] = sortedTranscripts[REFERENCE][refIndex]
154 progress.inc() 153 progress.inc()
155 progress.done() 154 progress.done()
156 155
157 def setTags(self, query, reference, direction): 156 def setTags(self, query, reference, direction):
158 refName = reference.getTagValue("ID") 157 refName = reference.getTagValue("ID")
159 if refName == None: 158 if refName == None:
160 refName = reference.getName() 159 refName = reference.getName()
161 if refName == None: 160 if refName == None:
162 refName = reference.__str__() 161 refName = reference.__str__()
163 query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction]), refName) 162 query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()]), refName)
164 query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction]), query.getDistance(reference)) 163 query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction*query.getDirection()]), query.getDistance(reference))
165 if direction == 0: 164 query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
166 query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()]) 165 if direction == 0:
167 query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)]) 166 query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)])
168 for tag in reference.getTagNames(): 167 for tag in reference.getTagNames():
169 if tag not in ("quality", "feature"): 168 if tag not in ("quality", "feature"):
170 query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction], tag), reference.getTagValue(tag)) 169 query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()], tag), reference.getTagValue(tag))
171 return query 170 return query
172 171
173 def write(self): 172 def write(self):
174 outputs = set() 173 progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
175 progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity) 174 for transcriptQuery in self.flankings.keys():
176 for transcriptRef in self.flankings.keys(): 175 if not self.flankings[transcriptQuery]:
177 if self.directions: 176 self.writer.addTranscript(transcriptQuery)
178 for direction in self.directions: 177 elif self.directions:
179 if direction in self.flankings[transcriptRef]: 178 for direction in self.directions:
180 query = self.flankings[transcriptRef][direction] 179 #relativeDirection = direction if transcriptQuery.getDirection() == 1 else - direction
181 outputs.add(self.setTags(query, transcriptRef, direction)) 180 relativeDirection = direction * transcriptQuery.getDirection()
182 else: 181 if relativeDirection in self.flankings[transcriptQuery]:
183 if self.flankings[transcriptRef]: 182 transcriptRef = self.flankings[transcriptQuery][relativeDirection]
184 query = sorted(self.flankings[transcriptRef].values(), key = lambda query: query.getDistance(transcriptRef))[0] 183 transcriptQuery = self.setTags(transcriptQuery, transcriptRef, relativeDirection)
185 outputs.add(self.setTags(query, transcriptRef, 0)) 184 self.writer.addTranscript(transcriptQuery)
186 progress.inc() 185 else:
187 for transcript in sorted(list(outputs), key = lambda flanking: (flanking.getChromosome(), flanking.getStart(), flanking.getEnd())): 186 transcriptRef = sorted(self.flankings[transcriptQuery].values(), key = lambda transcriptRef: transcriptQuery.getDistance(transcriptRef))[0]
188 self.writer.addTranscript(transcript) 187 self.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0))
189 self.writer.close() 188 progress.inc()
190 progress.done() 189 progress.done()
191 190
192 def run(self): 191 def run(self):
193 self.flankings = {} 192 for chromosome in sorted(self.transcripts[QUERY].keys()):
194 for direction in STRANDS: 193 self.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome])
195 self.getFlanking(direction) 194 for direction in STRANDS:
196 self.write() 195 #print "comparison", chromosome, direction
196 self.getFlanking(chromosome, direction)
197 self.write()
198 self.writer.close()
197 199
198 if __name__ == "__main__": 200 if __name__ == "__main__":
199 201
200 description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]" 202 description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"
201 203
202 parser = OptionParser(description = description) 204 parser = OptionParser(description = description)
203 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]") 205 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
204 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") 206 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
205 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") 207 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
206 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") 208 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
207 parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]") 209 parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]")
208 parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]") 210 parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]")
209 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]") 211 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]")
210 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]") 212 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]")
211 parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]") 213 parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
212 parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]") 214 parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]")
213 parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]") 215 parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]")
214 parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]") 216 parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
215 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") 217 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
216 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") 218 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
217 (options, args) = parser.parse_args() 219 (options, args) = parser.parse_args()
218 220
219 gf = GetFlanking(options.verbosity) 221 gf = GetFlanking(options.verbosity)
220 gf.setInputFile(options.inputFileName1, options.format1, QUERY) 222 gf.setInputFile(options.inputFileName1, options.format1, QUERY)
221 gf.setInputFile(options.inputFileName2, options.format2, REFERENCE) 223 gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
222 gf.setOutputFile(options.outputFileName) 224 gf.setOutputFile(options.outputFileName)
223 gf.addUpstreamDirection(options.upstream) 225 gf.addUpstreamDirection(options.upstream)
224 gf.addDownstreamDirection(options.downstream) 226 gf.addDownstreamDirection(options.downstream)
225 gf.setColinear(options.colinear) 227 gf.setColinear(options.colinear)
226 gf.setAntisense(options.antisense) 228 gf.setAntisense(options.antisense)
227 gf.setNoOverlap(options.noOverlap) 229 gf.setNoOverlap(options.noOverlap)
228 gf.setMinDistance(options.minDistance) 230 gf.setMinDistance(options.minDistance)
229 gf.setMaxDistance(options.maxDistance) 231 gf.setMaxDistance(options.maxDistance)
230 gf.setNewTagName(options.tagName) 232 gf.setNewTagName(options.tagName)
231 gf.run() 233 gf.run()