view commons/tools/CalcCoordCumulLength.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
line wrap: on
line source

#!/usr/bin/env python

"""
Calculate the cumulative length of coordinates data in the L{Map<commons.coreMap>} format.
"""

import os
import sys
import getopt
from pyRepet.launcher.programLauncher import programLauncher
from pyRepet.util.Stat import Stat
from commons.core.checker.CheckerUtils import CheckerUtils


class CalcCoordCumulLength( object ):
    """
    Compute the coverage of coordinates data in the L{Map<commons.core.ccommons.core    """
    
    
    def __init__( self ):
        """
        Constructor.
        """
        self._inFileName = ""
        self._outFileName = ""
        self._verbose = 0
        
        
    def help( self ):
        """
        Display the help on stdout.
        """
        print
        print "usage:",sys.argv[0]," [ options ]"
        print "options:"
        print "     -h: this help"
        print "     -i: name of the input file (format='map')"
        print "     -o: name of the output file (default=inFileName+'.coverage')"
        print
        
        
    def setAttributesFromCmdLine( self ):
        """
        Set the attributes from the command-line.
        """
        try:
            opts, args = getopt.getopt(sys.argv[1:],"hi:o:v:")
        except getopt.GetoptError, err:
            print str(err); self.help(); sys.exit(1)
        for o,a in opts:
            if o == "-h":
                self.help(); sys.exit(0)
            elif o == "-i":
                self.setInputFileName( a )
            elif o == "-o":
                self._outFileName = a
            elif o == "-v":
                self._verbose = int(a)
                
                
    def setInputFileName( self, inFileName ):
        self._inFileName = inFileName
        
    def setVerbose( self, verbose ):
        self._verbose = int(verbose)
        
    def checkAttributes( self ):
        """
        Check the attributes are valid before running the algorithm.
        """
        if self._inFileName == "":
            print "ERROR: missing input file"
            self.help(); sys.exit(1)
        if not os.path.exists( self._inFileName ):
            print "ERROR: can't find file '%s'" % ( self._inFileName )
            self.help(); sys.exit(1)
        if self._outFileName == "":
            self._outFileName = "%s.coverage" % ( self._inFileName )
            
            
    def mergeCoordinates( self ):
        """
        Merge the coordinates with 'mapOp'.
        """
        if self._verbose > 0:
            print "merge the coordinates with mapOp..."; sys.stdout.flush()
        if not CheckerUtils.isExecutableInUserPath( "mapOp" ):
            msg = "ERROR: 'mapOp' is not in your PATH"
            sys.stderr.write( "%s\n" % msg )
            sys.exit(1)
        pL = programLauncher()
        prg = os.environ["REPET_PATH"] + "/bin/mapOp"
        cmd = prg
        cmd += " -q %s" % ( self._inFileName )
        cmd += " -m"
        cmd += " > /dev/null"
        pL.launch( prg, cmd, self._verbose - 1 )
        if self._verbose > 0:
            print "coordinates merged !"; sys.stdout.flush()
        mergeFileName = "%s.merge" % ( self._inFileName )
        inPath, inName = os.path.split( self._inFileName )
        if inPath != "":
            os.system( "mv %s.merge %s" % ( inName, inPath ) )
        return mergeFileName
    
    
    def getStatsPerChr( self, mergeFileName ):
        """
        Return summary statistics on the coordinates, per chromosome.
        @param mergeFileName: L{Map<commons.core.coord.Macommons.coreype mergeFileName: string
        @return: dictionary whose keys are the chromosomes of the 'map file and values are L{Stat<pyRepet.util.Stat>} instances
        """
        dChr2Stats = {}
        if self._verbose > 0:
            print "compute the coverage of the coordinates..."; sys.stdout.flush()
        mergeF = open( mergeFileName, "r" )
        line = mergeF.readline()
        while True:
            if line == "": break
            tokens = line[:-1].split("\t")
            if int(tokens[2]) < int(tokens[3]):
                matchLength = int(tokens[3]) - int(tokens[2]) + 1
            elif int(tokens[2]) > int(tokens[3]):
                matchLength = int(tokens[2]) - int(tokens[3]) + 1
            if not dChr2Stats.has_key( tokens[1] ):
                dChr2Stats[ tokens[1] ] = Stat()
            dChr2Stats[ tokens[1] ].add( matchLength )
            line = mergeF.readline()
        mergeF.close()
        os.remove( mergeFileName )
        return dChr2Stats
    
    
    def saveCumulLength( self, dChr2Stats ):
        """
        Write the stats in the output file.
        """
        outF = open( self._outFileName, "w" )
        totalLength = 0
        for i in dChr2Stats.keys():
            totalLength += dChr2Stats[i].sum
            string = "cumulative length (in bp) on '%s': %i" % ( i, dChr2Stats[i].sum )
            outF.write( "%s\n" % ( string ) )
            if self._verbose > 0: print string
        string = "total cumulative length (in bp): %i" % ( totalLength )
        outF.write( "%s\n" % ( string ) )
        if self._verbose > 0: print string
        outF.close()
        sys.stdout.flush()
        
        
    def start( self ):
        """
        Useful commands before running the program.
        """
        if self._verbose > 0:
            print "beginning of %s" % (sys.argv[0].split("/")[-1]); sys.stdout.flush()
        self.checkAttributes()
        if self._verbose > 0:
            print "input file : '%s'" % ( self._inFileName )
            sys.stdout.flush()
            
            
    def end( self ):
        """
        Useful commands before ending the program.
        """
        if self._verbose > 0:
            print "%s finished successfully" % (sys.argv[0].split("/")[-1]); sys.stdout.flush()
            
            
    def run( self ):
        """
        Run the program.
        """
        self.start()
        mergeFileName = self.mergeCoordinates()
        dChr2Stats = self.getStatsPerChr( mergeFileName )
        self.saveCumulLength( dChr2Stats )
        self.end()
        
        
if __name__ == '__main__':
    i = CalcCoordCumulLength()
    i.setAttributesFromCmdLine()
    i.run()