view commons/tools/SpliceTEsFromGenome.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

import sys
import os
import getopt

from commons.core.sql.DbMySql import DbMySql
from commons.core.seq.FastaUtils import FastaUtils
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.AlignUtils import AlignUtils
from commons.core.coord.PathUtils import PathUtils


class SpliceTEsFromGenome( object ):
    """
    Command-line tool which splices TE copies out of a genome.

    The TE coordinates are given either as a file or as the name of a
    database table, in 'map', 'align' or 'path' format. They are converted
    into a 'map' file when needed, merged, and handed to
    FastaUtils.spliceFromCoords together with the genome fasta file.

    The code is written so that it runs unchanged on Python 2.6+ and Python 3
    (single-argument print calls, 'except ... as ...').
    """

    def __init__( self ):
        self._inputData = ""    # -i: input file, or table name
        self._formatData = ""   # -f: "map", "align" or "path"
        self._genomeFile = ""   # -g: genome fasta file
        self._configFile = ""   # -C: configuration file (table as input)
        self._outFile = ""      # -o: output fasta file
        self._verbose = 0       # -v: verbosity level
        self._db = None         # DbMySql handle, set only when the input is a table


    def help( self ):
        """
        Print the usage of the program on stdout.
        """
        usageLines = [
            "",
            "usage: SpliceTEsFromGenome.py [ options ]",
            "options:",
            "     -h: this help",
            "     -i: input TE coordinates (can be file or table)",
            "         TEs as subjects if align or path format",
            "     -f: format of the data (map/align/path)",
            "     -g: genome file (format=fasta)",
            "     -C: configuration file (if table as input)",
            "     -o: output fasta file (default=genomeFile+'.splice')",
            "     -v: verbosity level (default=0/1)",
            "",
        ]
        # a single print of the joined lines emits exactly one line per item
        print( "\n".join( usageLines ) )


    def _exitWithError( self, msg, showHelp=True ):
        """
        Write 'msg' on stderr, print the usage if 'showHelp' is true, and
        exit with status 1.
        """
        sys.stderr.write( "%s\n" % msg )
        if showHelp:
            self.help()
        sys.exit(1)


    def _closeDb( self ):
        """
        Close the database handle, if any, and forget it so that calling
        this method several times is harmless.
        """
        if self._db is not None:
            self._db.close()
            self._db = None


    def setAttributesFromCmdLine( self ):
        """
        Fill the attributes from the options found in sys.argv.

        Exits with status 1 (after printing the usage) on an unknown option
        or on a non-integer verbosity level, and with status 0 on '-h'.
        """
        try:
            opts, _ = getopt.getopt( sys.argv[1:], "hi:f:g:C:o:v:" )
        except getopt.GetoptError as err:
            self._exitWithError( "%s" % str(err) )
        for o, a in opts:
            if o == "-h":
                self.help()
                sys.exit(0)
            elif o == "-i":
                self._inputData = a
            elif o == "-f":
                self._formatData = a
            elif o == "-g":
                self._genomeFile = a
            elif o == "-C":
                self._configFile = a
            elif o == "-o":
                self._outFile = a
            elif o == "-v":
                try:
                    self._verbose = int(a)
                except ValueError:
                    # report a usage error instead of dying with a traceback
                    self._exitWithError( "ERROR: verbosity level (-v) should be an integer, not '%s'" % a )


    def checkAttributes( self ):
        """
        Check the attributes and exit with status 1 if one of them is wrong.

        When the input data is not an existing file, it is considered to be
        a table: the configuration file has to exist, a DbMySql connection is
        opened with it and the table has to exist.
        When no output file was given, it defaults to genomeFile+'.splice'.
        """
        if self._inputData == "":
            self._exitWithError( "ERROR: missing input data (-i)" )
        if not os.path.exists( self._inputData ):
            # not a file: try to use the input as a table name
            if not os.path.exists( self._configFile ):
                self._exitWithError( "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputData, self._configFile ) )
            self._db = DbMySql( cfgFileName=self._configFile )
            if not self._db.doesTableExist( self._inputData ):
                # do not leave the connection open when giving up
                self._closeDb()
                self._exitWithError( "ERROR: can't find table '%s'" % ( self._inputData ) )
        if self._formatData == "":
            self._exitWithError( "ERROR: need to precise format (-f)" )
        if self._formatData not in [ "map", "align", "path" ]:
            self._exitWithError( "ERROR: format '%s' not yet supported" % ( self._formatData ) )
        if self._genomeFile == "":
            self._exitWithError( "ERROR: missing genome file (-g)" )
        if not os.path.exists( self._genomeFile ):
            self._exitWithError( "ERROR: can't find genome file '%s'" % ( self._genomeFile ) )
        if self._outFile == "":
            self._outFile = "%s.splice" % ( self._genomeFile )
            if self._verbose > 0:
                print( "output fasta file: %s" % self._outFile )


    def getCoordsAsMapFile( self ):
        """
        Return the name of a 'map' file containing the TE coordinates.

        If the input is a table, it is first exported with the program
        'srptExportTable.py' into the file '<table>.<format>', which becomes
        the new input data. An 'align' or 'path' input is then converted into
        the file '<input>.map'; a 'map' input is returned as is.
        Exits with status 1 if the export fails.
        """
        if self._verbose > 0:
            print( "get TE coordinates as 'Map' file" )
            sys.stdout.flush()
        if self._db is not None:
            import subprocess
            exportedFile = "%s.%s" % ( self._inputData, self._formatData )
            # argument list (no shell): names with spaces or shell
            # metacharacters are passed untouched to the program
            cmd = [ "srptExportTable.py",
                    "-i", self._inputData,
                    "-C", self._configFile,
                    "-o", exportedFile ]
            try:
                returnStatus = subprocess.call( cmd )
            except OSError:
                # program not found or not executable
                returnStatus = 1
            if returnStatus != 0:
                self._exitWithError( "ERROR while exporting data from table", showHelp=False )
            self._inputData = exportedFile

        if self._formatData == "map":
            return self._inputData
        elif self._formatData == "align":
            mapFile = "%s.map" % ( self._inputData )
            AlignUtils.convertAlignFileIntoMapFileWithSubjectsOnQueries( self._inputData, mapFile )
            return mapFile
        elif self._formatData == "path":
            mapFile = "%s.map" % ( self._inputData )
            PathUtils.convertPathFileIntoMapFileWithSubjectsOnQueries( self._inputData, mapFile )
            return mapFile


    def mergeCoordsInMapFile( self, mapFile ):
        """
        Merge the coordinates of 'mapFile' into the file '<mapFile>.merge'
        and return the name of this new file.

        'mapFile' is removed when it was generated by this program, i.e. when
        it is not the 'map' file given by the user.
        """
        if self._verbose > 0:
            print( "merge TE coordinates" )
            sys.stdout.flush()
        mergeFile = "%s.merge" % ( mapFile )
        MapUtils.mergeCoordsInFile( mapFile, mergeFile )
        # NOTE(review): for a table exported in 'align' or 'path' format, the
        # exported file itself is not removed anywhere; confirm it is intended
        if self._formatData != "map" or self._db is not None:
            os.remove( mapFile )
        return mergeFile


    def spliceFastaFromCoords( self, mergeFile ):
        """
        Call FastaUtils.spliceFromCoords on the genome file with the
        coordinates of 'mergeFile' and the output file, then remove
        'mergeFile'.
        """
        if self._verbose > 0:
            print( "splice TE copies from the genome" )
            sys.stdout.flush()
        FastaUtils.spliceFromCoords( self._genomeFile,
                                     mergeFile,
                                     self._outFile )

        os.remove( mergeFile )


    def start( self ):
        """
        Check the attributes and, if verbose, announce the start of the program.
        """
        self.checkAttributes()
        if self._verbose > 0:
            print( "START SpliceTEsFromGenome.py" )
            sys.stdout.flush()


    def end( self ):
        """
        Close the database connection, if any, and, if verbose, announce
        the end of the program.
        """
        self._closeDb()
        if self._verbose > 0:
            print( "END SpliceTEsFromGenome.py" )
            sys.stdout.flush()


    def run( self ):
        """
        Run the whole pipeline: check, convert to 'map', merge, splice.

        The database connection is closed even if a step fails or exits.
        """
        try:
            self.start()

            mapFile = self.getCoordsAsMapFile()

            mergeFile = self.mergeCoordsInMapFile( mapFile )

            self.spliceFastaFromCoords( mergeFile )

            self.end()
        finally:
            # no-op after a successful end(); releases the connection otherwise
            self._closeDb()
        
        
if __name__ == "__main__":
    # Script entry point: build the tool, read its options from the
    # command line, then run the whole pipeline.
    splicer = SpliceTEsFromGenome()
    splicer.setAttributesFromCmdLine()
    splicer.run()