view commons/tools/CorrelateTEageWithGCcontent.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
line wrap: on
line source

#!/usr/bin/env python

import sys
import os
import getopt
import math
from commons.core.sql.DbMySql import DbMySql
from commons.core.sql.TablePathAdaptator import TablePathAdaptator
from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
from commons.core.coord.PathUtils import PathUtils
from commons.core.coord.SetUtils import SetUtils
from commons.core.seq.BioseqUtils import BioseqUtils


class CorrelateTEageWithGCcontent( object ):
    
    def __init__( self ):
        self._inputCoord = ""
        self._inputGenome = ""
        self._inputTErefseq = ""
        self._configFile = ""
        self._outFile = ""
        self._verbose = 0
        self._db = None
        self._tableCoord = ""
        self._pathA = TablePathAdaptator()
        self._tableGenome = ""
        self._seqA = TableSeqAdaptator()
        
        
    def help( self ):
        print
        print "usage: CorrelateTEageWithGCcontent.py [ options ]"
        print "options:"
        print "     -h: this help"
        print "     -i: input TE coordinates (can be file or table)"
        print "         TEs as subjects in 'path' format"
        print "     -g: input genome sequences (can be fasta file or table)"
        print "     -r: input TE reference sequences (can be fasta file or table)"
        print "     -C: configuration file (if table as input)"
        print "     -o: output fasta file (default=inputCoord+'.gc')"
        print "     -v: verbosity level (default=0/1)"
        print
        
        
    def setAttributesFromCmdLine( self ):
        try:
            opts, args = getopt.getopt(sys.argv[1:],"hi:g:r:C:o:v:")
        except getopt.GetoptError, err:
            msg = "%s" % str(err)
            sys.stderr.write( "%s\n" % msg )
            self.help(); sys.exit(1)
        for o,a in opts:
            if o == "-h":
                self.help(); sys.exit(0)
            elif o == "-i":
                self._inputCoord = a
            elif o == "-g":
                self._inputGenome = a
            elif o == "-r":
                self._inputTErefseq = a
            elif o == "-C":
                self._configFile = a
            elif o =="-o":
                self._outFile = a
            elif o == "-v":
                self._verbose = int(a)
                
                
    def checkAttributes( self ):
        if self._inputCoord == "":
            msg = "ERROR: missing input TE coordinates (-i)"
            sys.stderr.write( "%s\n" % msg )
            self.help()
            sys.exit(1)
        if not os.path.exists( self._inputCoord ):
            if not os.path.exists( self._configFile ):
                msg = "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputCoord, self._configFile )
                sys.stderr.write( "%s\n" % msg )
                self.help()
                sys.exit(1)
            if not os.path.exists( self._configFile ):
                msg = "ERROR: can't find configuration file '%s'" % ( self._configFile )
                sys.stderr.write( "%s\n" % msg )
                sys.exit(1)
            self._db = DbMySql( cfgFileName=self._configFile )
            if not self._db.exist( self._inputCoord ):
                msg = "ERROR: can't find table '%s'" % ( self._inputCoord )
                sys.stderr.write( "%s\n" % msg )
                self.help()
                sys.exit(1)
            self._tableCoord = self._inputCoord
        else:
            self._tableCoord = self._inputCoord.replace(".","_")
        if self._inputGenome == "":
            msg = "ERROR: missing input genome sequences (-g)"
            sys.stderr.write( "%s\n" % msg )
            self.help()
            sys.exit(1)
        if not os.path.exists( self._inputGenome ):
            if not self._db.doesTableExist( self._inputGenome ):
                msg = "ERROR: can't find table '%s'" % ( self._inputGenome )
                sys.stderr.write( "%s\n" % msg )
                self.help()
                sys.exit(1)
            self._tableGenome = self._inputGenome
        else:
            self._tableGenome = self._inputGenome.replace(".","_")
        if self._inputTErefseq == "":
            msg = "ERROR: missing input TE reference sequences (-r)"
            sys.stderr.write( "%s\n" % msg )
            self.help()
            sys.exit(1)
        if not os.path.exists( self._inputTErefseq ):
            if not self._db.doesTableExist( self._inputTErefseq ):
                msg = "ERROR: can't find table '%s'" % ( self._inputTErefseq )
                sys.stderr.write( "%s\n" % msg )
                self.help()
                sys.exit(1)
        if self._outFile == "":
            self._outFile = "%s.gc" % ( self._inputCoord )
            
            
    def getLengthOfTErefseqs( self ):
        if os.path.exists( self._inputTErefseq ):
            return BioseqUtils.getLengthPerSeqFromFile( self._inputTErefseq )
        else:
            dTErefseq2Length = {}
            refseqA = TableSeqAdaptator( self._db, self._inputTErefseq )
            lAccessions = refseqA.getAccessionsList()
            for acc in lAccessions:
                dTErefseq2Length[ acc ] = refseqA.getSeqLengthFromAccession( acc )
            return dTErefseq2Length
        
        
    def start( self ):
        self.checkAttributes()
        if self._verbose > 0:
            print "START CorrelateTEageWithGCcontent.py"
            sys.stdout.flush()
        if os.path.exists( self._inputCoord ):
            self._db = DbMySql( cfgFileName=self._configFile )
            self._db.createTable( self._tableCoord, "path", self._inputCoord, True )
        self._pathA = TablePathAdaptator( self._db, self._tableCoord )
        if os.path.exists( self._inputGenome ):
            self._db.createTable( self._tableGenome, "seq", self._inputGenome, True )
        self._seqA = TableSeqAdaptator( self._db, self._tableGenome )
        if self._verbose > 0:
            print "output fasta file: %s" % self._outFile
            
            
    def end( self ):
        if os.path.exists( self._inputCoord ):
            self._db.dropTable( self._tableCoord )
        if os.path.exists( self._inputGenome ):
            self._db.dropTable( self._tableGenome )
        self._db.close()
        if self._verbose > 0:
            print "END CorrelateTEageWithGCcontent.py"
            sys.stdout.flush()
            
            
    def run( self ):
        self.start()
        
        dTErefseq2Length = self.getLengthOfTErefseqs()
        
        outFileHandler = open( self._outFile, "w" )
        outFileHandler.write( "copy\tTE\tchr\tlength\tid\tGC\tlengthPerc\n" )
        
        lIdentifiers = self._pathA.getIdList()
        nbTEcopies = len(lIdentifiers)
        if self._verbose > 0:
            print "nb of TE copies: %i" % ( nbTEcopies )
            sys.stdout.flush()
        count = 0
        power10 = int( math.floor( math.log10( nbTEcopies ) ) ) - 1
        for id in lIdentifiers:
            count += 1
            if self._verbose > 0 and power10 > 0 and count % math.pow(10,power10) == 0:
                print "%s / %s" % ( str(count).zfill(power10+2), str(nbTEcopies).zfill(power10+2) )
                sys.stdout.flush()
            lPaths = self._pathA.getPathListFromId( id )
            lSets = PathUtils.getSetListFromQueries( lPaths )
            lMergedSets = SetUtils.mergeSetsInList( lSets )
            bs = self._seqA.getBioseqFromSetList( lMergedSets )
            data = "%i" % id
            data += "\t%s" % ( bs.header.split("::")[0] )
            data += "\t%s" % ( lPaths[0].getQueryName() )
            data += "\t%i" % ( bs.getLength() )
            data += "\t%.2f" % ( PathUtils.getIdentityFromPathList( lPaths ) )
            data += "\t%.2f" % ( bs.getGCpercentage() )
            data += "\t%.2f" % ( 100 * bs.getLength() / float( dTErefseq2Length[ bs.header.split("::")[0] ] ) )
            outFileHandler.write( "%s\n" % data )
            
        outFileHandler.close()
        
        self.end()
        
        
if __name__ == "__main__":
    i = CorrelateTEageWithGCcontent()
    i.setAttributesFromCmdLine()
    i.run()