view commons/core/parsing/FindRep.py @ 9:1eb55963fe39

Updated CompareOverlappingSmall*.py
author m-zytnicki
date Thu, 14 Mar 2013 05:23:05 -0400
parents 769e306b7933
children
line wrap: on
line source

import re
from xml.sax.handler import ContentHandler

class FindRep( ContentHandler ):
    """SAX content handler that writes one tab-separated line per <repeat>.

    The handler reacts to the elements ``window``, ``sequence-name``,
    ``repeat``, ``start``, ``end``, ``period``, ``unit`` and ``score``.
    When a ``repeat`` element closes, and its score does not exceed
    ``filter``, one line is appended to the output file::

        <count> TAB (<unit>)<copies> TAB <seqname> TAB <start> TAB <end>

    where ``<unit>`` is the most frequent ``unit`` text seen inside the
    repeat and ``<copies>`` is ``(abs(start - end) + 1) // period``.

    The output file is opened in ``startDocument`` and closed in
    ``endDocument``.
    """

    # Sequence names of the form "<digits>...{Cut}<offset>..<something>" carry
    # an offset that is added to the repeat coordinates.
    _CUT_SUFFIX = "{Cut}"
    _CUT_RE = re.compile(r"^[0-9]+.+\{Cut\}")
    _LEADING_NUMBER_RE = re.compile(r"^[0-9]+ ")

    def __init__(self, outfileName, filter=0, count=0):
        """Create the handler.

        outfileName -- path of the file written during parsing.
        filter      -- repeats whose score is strictly greater than this
                       value are skipped.
        count       -- initial value of the running repeat counter; it is
                       incremented before each line is written.
        """
        # Explicit base-class call (not super()) so that this also works
        # where ContentHandler is an old-style class.
        ContentHandler.__init__(self)
        self.inWindowContent = 0
        self.inSeqNameContent = 0
        self.inRepContent = 0
        self.inStartContent = 0
        self.inEndContent = 0
        self.inPeriodContent = 0
        self.inUnitContent = 0
        self.inScoreContent = 0
        self.count = count
        self._outfileName = outfileName
        self.filter = filter
        # Text buffers; defined up front so that a document missing one of
        # the elements fails on the value conversion, not on a missing
        # attribute.
        self.seqname = ""
        self.start = ""
        self.end = ""
        self.period = ""
        self.unit = ""
        self.score = ""
        # Maps unit text -> number of occurrences inside the current repeat.
        self.type = {}

    def startDocument(self):
        """Open the output file for writing."""
        self._fileout = open(self._outfileName, "w")

    def startElement(self, name, attrs):
        """Raise the flag for the opened element and reset its buffer."""
        if name == "window":
            self.inWindowContent = 1
        elif name == "sequence-name":
            self.inSeqNameContent = 1
            self.seqname = ""
        elif name == "repeat":
            self.inRepContent = 1
            self.start = ""
            self.end = ""
            self.period = ""
            self.type = {}
        elif name == "start":
            self.inStartContent = 1
        elif name == "end":
            self.inEndContent = 1
        elif name == "period":
            self.inPeriodContent = 1
        elif name == "unit":
            self.inUnitContent = 1
            self.unit = ""
        elif name == "score":
            self.inScoreContent = 1
            self.score = ""

    def characters(self, ch):
        """Append character data to the buffer of the element being read.

        The parser may deliver the text of one element in several chunks,
        hence the accumulation.
        """
        if self.inSeqNameContent:
            self.seqname += ch
        elif self.inStartContent:
            self.start += ch
        elif self.inEndContent:
            self.end += ch
        elif self.inPeriodContent:
            self.period += ch
        elif self.inUnitContent:
            self.unit += ch
        elif self.inScoreContent:
            self.score += ch

    def endElement(self, name):
        """Lower the flag of the closed element and process its content."""
        if name == "window":
            self.inWindowContent = 0
        elif name == "sequence-name":
            self.inSeqNameContent = 0
        elif name == "repeat":
            self.inRepContent = 0
            self._writeRepeat()
        elif name == "start":
            self.inStartContent = 0
        elif name == "end":
            self.inEndContent = 0
        elif name == "period":
            self.inPeriodContent = 0
        elif name == "score":
            self.inScoreContent = 0
        elif name == "unit":
            self.inUnitContent = 0
            # dict.get instead of has_key(), which Python 3 dicts lack.
            self.type[self.unit] = self.type.get(self.unit, 0) + 1

    def endDocument(self):
        """Close the output file."""
        self._fileout.close()

    def _mostFrequentUnit(self):
        """Return the unit text seen most often in the current repeat.

        Ties go to the first unit in dictionary iteration order. An empty
        string is returned when the repeat contained no <unit> element.
        """
        if not self.type:
            return ""
        return max(self.type, key=self.type.get)

    def _resolveLocation(self, start, end):
        """Return (seqname, start, end) adjusted for a "{Cut}" sequence name.

        When the sequence name matches ``<digits>...{Cut}<offset>..``, the
        leading number and the "{Cut}" marker are stripped from the name and
        ``offset - 1`` is added to both coordinates. Otherwise the name and
        coordinates are returned unchanged.
        """
        match = self._CUT_RE.match(self.seqname)
        if match is None:
            return self.seqname, start, end
        name = self.seqname[:match.end(0) - len(self._CUT_SUFFIX)].rstrip()
        name = self._LEADING_NUMBER_RE.sub("", name).lstrip()
        shift = int(self.seqname[match.end(0):].split("..")[0]) - 1
        return name, start + shift, end + shift

    def _writeRepeat(self):
        """Write the output line for the repeat that has just been closed."""
        start = int(self.start)
        end = int(self.end)
        period = int(self.period)
        score = float(self.score)
        if score > self.filter:
            return
        self.count += 1

        unit = self._mostFrequentUnit()
        if len(unit) > 100:
            # Keep head and tail of very long units, joined by an ellipsis.
            unit = unit[:48] + "..." + unit[-51:]

        seqname, astart, aend = self._resolveLocation(start, end)
        # Floor division keeps the integer result on Python 2 and Python 3.
        copies = (abs(start - end) + 1) // period
        self._fileout.write(
            "%d\t(%s)%d\t%s\t%d\t%d\n"
            % (self.count, unit, copies, seqname, astart, aend)
        )