view commons/tools/MergeMatchsFiles.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

from commons.core.utils.FileUtils import FileUtils
from commons.core.coord.Align import Align
import shutil
import os
import sys

class MergeMatchsFiles(object):
    """
    Merge every file matching '*.<fileType>' into '<outFileBaseName>.<fileType>'.

    The files are first concatenated into '<outFileBaseName>.<fileType>_tmp'.
    For the 'align' type, the temporary file is then either filtered
    (all-by-all mode) or simply renamed; for any other type it is handed to
    the external program '$REPET_PATH/bin/<fileType>num2id'.
    """

    def __init__(self, fileType, outFileBaseName, allByAll = False, clean = True, verbose = 0):
        """
        @param fileType: extension of the files to merge (e.g. 'align'); it is
                         also the extension of the output file
        @param outFileBaseName: output file name, without its extension
        @param allByAll: if True and fileType is 'align', redundant matches
                         are filtered out of the merged file
        @param clean: if True, the merged input files and the temporary file
                      are removed
        @param verbose: verbosity level (a message is printed when > 1)
        """
        self._fileType = fileType
        self._outFileBaseName = outFileBaseName
        self._allByAll = allByAll
        self._verbose = verbose
        self._clean = clean

    def _filterRedundantMatches( self, inFile, outFile ):
        """
        When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks),
        one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...'
        and we discard 'chunk7-11-110-chunk3-1-100-...'.
        Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
        'chunk5-11-110-chunk5-1-100-...'.
        For this of course the results need to be sorted by query, on plus strand,
        and in ascending coordinates (always the case with Blaster).

        The program exits with status 1 if a query or subject name does not
        contain 'chunk'.

        @param inFile: name of the input file, one match per line
        @param outFile: name of the output file receiving the kept matches
        """
        tick = 100000
        # 'with' guarantees that both handlers are closed, even when a line
        # cannot be parsed or when sys.exit() is called below
        with open( inFile, "r" ) as inFileHandler:
            with open( outFile, "w" ) as outFileHandler:
                iAlign = Align()
                countMatches = 0
                for line in inFileHandler:
                    countMatches += 1
                    iAlign.setFromString( line )
                    queryName = iAlign.range_query.seqname
                    subjectName = iAlign.range_subject.seqname
                    if "chunk" not in queryName or "chunk" not in subjectName:
                        print("ERROR: 'chunk' not in seqname")
                        sys.exit(1)
                    # the chunk index is whatever follows the first 'chunk' in the name
                    queryChunk = int(queryName.split("chunk")[1])
                    subjectChunk = int(subjectName.split("chunk")[1])
                    if queryChunk < subjectChunk:
                        iAlign.write( outFileHandler )
                    elif queryChunk == subjectChunk:
                        # same chunk on both sides: keep only the match whose
                        # query starts before its subject
                        if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                            iAlign.write( outFileHandler )
                    if countMatches % tick == 0:   # need to free buffer frequently as file can be big
                        outFileHandler.flush()
                        os.fsync( outFileHandler.fileno() )

    def run(self):
        """
        Concatenate the result files and build the final output file.

        The program exits with status 1 if the external 'num2id' program
        returns a non-zero status.
        """
        if self._verbose > 1:
            print("concatenate the results of each job")
            sys.stdout.flush()

        tmpFileName = "%s.%s_tmp" % (self._outFileBaseName, self._fileType)
        outFileName = "%s.%s" % (self._outFileBaseName, self._fileType)
        # NOTE(review): this pattern also matches a pre-existing output file
        # located in the current directory -- confirm this is intended
        pattern = "*.%s" % self._fileType

        # start from a fresh temporary file
        if os.path.exists(tmpFileName):
            os.remove(tmpFileName)

        FileUtils.catFilesByPattern(pattern, tmpFileName)
        if self._clean:
            FileUtils.removeFilesByPattern(pattern)

        if self._fileType == "align":
            if self._allByAll:
                self._filterRedundantMatches(tmpFileName, outFileName)
            else:
                shutil.move(tmpFileName, outFileName)
        else:
            # other types are converted by the matching '<fileType>num2id' program
            prg = "%s/bin/%snum2id" % (os.environ["REPET_PATH"], self._fileType)
            cmd = prg
            cmd += " -i %s" % tmpFileName
            cmd += " -o %s" % outFileName
            cmd += " -v %i" % (self._verbose - 1)
            log = os.system(cmd)   # raw status as returned by os.system
            if log != 0:
                print("*** Error: %s returned %i" % (prg, log))
                sys.exit(1)
        # the temporary file no longer exists if it was moved above
        if self._clean and FileUtils.isRessourceExists(tmpFileName):
            os.remove(tmpFileName)