Mercurial > repos > yufei-luo > s_mart
view commons/core/parsing/FindRep.py @ 69:1473ab954708 draft
Corrected bug in "CollapsedReads" XML file.
author | m-zytnicki |
---|---|
date | Wed, 18 Nov 2015 10:59:02 -0500 |
parents | 769e306b7933 |
children |
line wrap: on
line source
import re
from xml.sax.handler import ContentHandler

# A sequence name of the form "<digits><text>{Cut}<offset>..<rest>" carries an
# offset that is added to the repeat coordinates before they are written.
_CUT_HEADER_RE = re.compile(r"^[0-9]+.+\{Cut\}")
# Leading "<digits> " index stripped from the name that precedes "{Cut}".
_LEADING_INDEX_RE = re.compile(r"^[0-9]+ ")
_CUT_TAG_LENGTH = len("{Cut}")


class FindRep(ContentHandler):
    """SAX handler writing one tab-separated line per "repeat" element.

    The handler reacts to the elements "window", "sequence-name", "repeat",
    "start", "end", "period", "unit" and "score"; every other element is
    ignored.  When a "repeat" element closes, a line with the fields

        count, "(" + most frequent unit + ")" + copy number,
        sequence name, start, end

    is appended to the output file, unless the repeat's score is greater
    than ``filter``.
    NOTE(review): the element names look like mreps XML output -- confirm.

    Args:
        outfileName: path of the file opened for writing in startDocument().
        filter: repeats whose score is strictly greater than this value are
            skipped (they do not increment the counter either).
        count: initial value of the running repeat counter; it is incremented
            before each line is written.
    """

    # Element name -> flag attribute toggled while the element is open.
    _FLAG_BY_ELEMENT = {
        "window": "inWindowContent",
        "sequence-name": "inSeqNameContent",
        "repeat": "inRepContent",
        "start": "inStartContent",
        "end": "inEndContent",
        "period": "inPeriodContent",
        "unit": "inUnitContent",
        "score": "inScoreContent",
    }

    # (flag, text buffer) pairs in the priority order used by characters().
    _TEXT_TARGETS = (
        ("inSeqNameContent", "seqname"),
        ("inStartContent", "start"),
        ("inEndContent", "end"),
        ("inPeriodContent", "period"),
        ("inUnitContent", "unit"),
        ("inScoreContent", "score"),
    )

    def __init__(self, outfileName, filter=0, count=0):
        # Explicit base call (not super()) so this also works where
        # ContentHandler is an old-style class.
        ContentHandler.__init__(self)
        self.inWindowContent = 0
        self.inSeqNameContent = 0
        self.inRepContent = 0
        self.inStartContent = 0
        self.inEndContent = 0
        self.inPeriodContent = 0
        self.inUnitContent = 0
        self.inScoreContent = 0
        # Text buffers and the unit tally exist from the start, so elements
        # seen outside a "repeat" cannot trigger an AttributeError.
        self.seqname = ""
        self.start = ""
        self.end = ""
        self.period = ""
        self.unit = ""
        self.score = ""
        self.type = {}
        self.count = count
        self._outfileName = outfileName
        self.filter = filter

    def startDocument(self):
        """Open the output file; it is closed again in endDocument()."""
        self._fileout = open(self._outfileName, "w")

    def startElement(self, name, attrs):
        """Raise the flag for a known element and reset its text buffer(s)."""
        flag = self._FLAG_BY_ELEMENT.get(name)
        if flag is None:
            return
        setattr(self, flag, 1)
        if name == "sequence-name":
            self.seqname = ""
        elif name == "repeat":
            # A new repeat starts with empty coordinates and unit tally.
            self.start = ""
            self.end = ""
            self.period = ""
            self.type = {}
        elif name == "unit":
            self.unit = ""
        elif name == "score":
            self.score = ""

    def characters(self, ch):
        """Append character data to the buffer of the first active flag."""
        for flag, buffer_name in self._TEXT_TARGETS:
            if getattr(self, flag):
                setattr(self, buffer_name, getattr(self, buffer_name) + ch)
                return

    def endElement(self, name):
        """Lower the element's flag; tally units and emit finished repeats."""
        flag = self._FLAG_BY_ELEMENT.get(name)
        if flag is None:
            return
        setattr(self, flag, 0)
        if name == "unit":
            # dict.get() instead of has_key(), which Python 3 removed.
            self.type[self.unit] = self.type.get(self.unit, 0) + 1
        elif name == "repeat":
            self._write_repeat()

    def endDocument(self):
        """Close the output file opened in startDocument()."""
        self._fileout.close()

    def _write_repeat(self):
        """Write the output line for the repeat that has just been closed.

        Raises:
            ValueError: if start, end, period or score text is not numeric.
            ZeroDivisionError: if the period is 0.
        """
        start = int(self.start)
        end = int(self.end)
        period = int(self.period)
        score = float(self.score)
        if score > self.filter:
            return
        self.count += 1

        # Most frequent unit; max() keeps the first one on ties.  A repeat
        # without any unit yields an empty unit instead of crashing.
        unit = max(self.type, key=self.type.get) if self.type else ""
        if len(unit) > 100:
            # Long units are abbreviated to 48 + "..." + 51 characters.
            unit = unit[:48] + "..." + unit[-51:]

        seqname, shift = self._split_cut_header()
        copies = (abs(start - end) + 1) // period
        self._fileout.write(
            "%d\t(%s)%d\t%s\t%d\t%d\n"
            % (self.count, unit, copies, seqname, start + shift, end + shift)
        )

    def _split_cut_header(self):
        """Return (sequence name, coordinate shift) for the current name.

        Names that do not match the "{Cut}" pattern are returned unchanged
        with a shift of 0.  Otherwise the text before "{Cut}" (minus its
        leading "<digits> " index) is the name, and the shift is the first
        ".."-separated number after "{Cut}" minus one.
        """
        match = _CUT_HEADER_RE.match(self.seqname)
        if match is None:
            return self.seqname, 0
        header = self.seqname[:match.end() - _CUT_TAG_LENGTH].rstrip()
        seqname = _LEADING_INDEX_RE.sub("", header).lstrip()
        offset_text = self.seqname[match.end():].split("..")[0]
        return seqname, int(offset_text) - 1