| 18 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 """ | 
|  | 4 Interface to orient sequences before making a multiple alignment. | 
|  | 5 Use hashing or suffix tree to get an idea of the appropriate strand. | 
|  | 6 Use 'orienter' by default, otherwise use 'mummer'. | 
|  | 7 """ | 
|  | 8 | 
|  | 9 import sys | 
|  | 10 import os | 
|  | 11 import glob | 
|  | 12 import getopt | 
|  | 13 | 
|  | 14 from commons.core.seq.BioseqDB import BioseqDB | 
|  | 15 import pyRepet.seq.fastaDB | 
|  | 16 from commons.core.checker.CheckerUtils import CheckerUtils | 
|  | 17 | 
|  | 18 class OrientSequences( object ): | 
|  | 19     """ | 
|  | 20     Interface to orient sequences before making a multiple alignment. | 
|  | 21     Use hashing or suffix tree to get an idea of the appropriate strand. | 
|  | 22     Use 'orienter' by default, otherwise use 'mummer'. | 
|  | 23     """ | 
|  | 24 | 
|  | 25     def __init__(self, inFileName="", minMatchLength=10, prgToOrient = "orienter", outFileName="", clean=False, verbosity=1): | 
|  | 26         """ | 
|  | 27         Constructor. | 
|  | 28         """ | 
|  | 29         self._inFileName = inFileName | 
|  | 30         self._minMatchLength = minMatchLength | 
|  | 31         self._prgToOrient = prgToOrient | 
|  | 32         self._outFileName = outFileName | 
|  | 33         self._clean = clean | 
|  | 34         self._verbose = verbosity | 
|  | 35 | 
|  | 36     def help( self ): | 
|  | 37         """ | 
|  | 38         Display the help on stdout. | 
|  | 39         """ | 
|  | 40         print | 
|  | 41         print "usage:",sys.argv[0].split("/")[-1],"[options]" | 
|  | 42         print "options:" | 
|  | 43         print "     -h: this help" | 
|  | 44         print "     -i: name of the input file (format='fasta')" | 
|  | 45         print "     -m: minimum match length (default=10)" | 
|  | 46         print "     -p: program to use first (default=orienter/mummer)" | 
|  | 47         print "     -o: name of the output file (default=inFileName+'.oriented')" | 
|  | 48         print "     -c: clean" | 
|  | 49         print "     -v: verbosity level (0/default=1/2)" | 
|  | 50         print | 
|  | 51 | 
|  | 52     def setAttributesFromCmdLine( self ): | 
|  | 53         """ | 
|  | 54         Set the attributes from the command-line. | 
|  | 55         """ | 
|  | 56         try: | 
|  | 57             opts, args = getopt.getopt(sys.argv[1:],"hi:m:p:o:cv:") | 
|  | 58         except getopt.GetoptError, err: | 
|  | 59             print str(err); self.help(); sys.exit(1) | 
|  | 60         for o,a in opts: | 
|  | 61             if o == "-h": | 
|  | 62                 self.help(); sys.exit(0) | 
|  | 63             elif o == "-i": | 
|  | 64                 self.setInputFileName( a ) | 
|  | 65             elif o == "-m": | 
|  | 66                 self.setMinMatchLength( a ) | 
|  | 67             elif o == "-p": | 
|  | 68                 self.setPrgToOrient( a ) | 
|  | 69             elif o == "-o": | 
|  | 70                 self.setOutputFileName( a ) | 
|  | 71             elif o == "-c": | 
|  | 72                 self.setClean() | 
|  | 73             elif o == "-v": | 
|  | 74                 self.setVerbosityLevel( a ) | 
|  | 75 | 
|  | 76     def setInputFileName( self, inFileName ): | 
|  | 77         self._inFileName = inFileName | 
|  | 78 | 
|  | 79     def setMinMatchLength( self, minMatchLength ): | 
|  | 80         self._minMatchLength = int(minMatchLength) | 
|  | 81 | 
|  | 82     def setPrgToOrient( self, prgToOrient ): | 
|  | 83         self._prgToOrient = prgToOrient | 
|  | 84 | 
|  | 85     def setOutputFileName( self, outFileName ): | 
|  | 86         self._outFileName = outFileName | 
|  | 87 | 
|  | 88     def setClean( self ): | 
|  | 89         self._clean = True | 
|  | 90 | 
|  | 91     def setVerbosityLevel( self, verbose ): | 
|  | 92         self._verbose = int(verbose) | 
|  | 93 | 
|  | 94     def checkAttributes( self ): | 
|  | 95         """ | 
|  | 96         Check the attributes are valid before running the algorithm. | 
|  | 97         """ | 
|  | 98         if self._inFileName == "": | 
|  | 99             print "ERROR: missing input file name" | 
|  | 100             self.help(); sys.exit(1) | 
|  | 101         if not os.path.exists( self._inFileName ): | 
|  | 102             print "ERROR: input file '%s' doesn't exist" % ( self._inFileName ) | 
|  | 103             self.help(); sys.exit(1) | 
|  | 104         if self._prgToOrient not in [ "orienter", "mummer" ]: | 
|  | 105             print "ERROR: unknown program '%s'" % ( self._prgToOrient ) | 
|  | 106             self.help(); sys.exit(1) | 
|  | 107         if self._outFileName == "": | 
|  | 108             self._outFileName = "%s.oriented" % ( self._inFileName ) | 
|  | 109 | 
|  | 110     def useOrienter( self ): | 
|  | 111         """ | 
|  | 112         Use 'orienter'. | 
|  | 113         @return: exit value of 'orienter' | 
|  | 114         """ | 
|  | 115         prg = "orienter" | 
|  | 116         cmd = prg | 
|  | 117         cmd += " -k" | 
|  | 118 #        cmd += " -l %i" % ( self._minMatchLength ) | 
|  | 119         cmd += " -v %i" % ( self._verbose ) | 
|  | 120         cmd += " %s" % ( self._inFileName ) | 
|  | 121         log = os.system( cmd ) | 
|  | 122         if log == 0 and self._outFileName.split(".")[-1] != "oriented": | 
|  | 123             os.rename( self._inFileName + ".oriented", self._outFileName ) | 
|  | 124         return log | 
|  | 125 | 
|  | 126     def compareInputSequencesWithMummer( self, nbInSeq ): | 
|  | 127         """ | 
|  | 128         Launch MUmmer on two single-sequence fasta files to find all maximal matches regardless of their uniqueness and record stdout. | 
|  | 129         Only N(N-1)/2 comparisons are made. | 
|  | 130         @param nbInSeq: number of input sequences | 
|  | 131         @type nbInSeq: integer | 
|  | 132         """ | 
|  | 133         if self._verbose > 0: | 
|  | 134             print "aligning input sequences..." | 
|  | 135             sys.stdout.flush() | 
|  | 136         if not CheckerUtils.isExecutableInUserPath( "mummer" ): | 
|  | 137             msg = "ERROR: 'mummer' is not in your PATH" | 
|  | 138             sys.stderr.write( "%s\n" % ( msg ) ) | 
|  | 139             sys.exit(1) | 
|  | 140         lInFiles = glob.glob( "batch_*.fa" ) | 
|  | 141         for i in range( 1, nbInSeq+1 ): | 
|  | 142             for j in range( i+1, nbInSeq+1 ): | 
|  | 143                 if self._verbose > 1: | 
|  | 144                     print "launch MUmmer on %i versus %i" % ( i, j ) | 
|  | 145                     sys.stdout.flush() | 
|  | 146                 prg = "mummer" | 
|  | 147                 cmd = prg | 
|  | 148                 cmd += " -maxmatch" | 
|  | 149                 cmd += " -l %i" % ( self._minMatchLength ) | 
|  | 150                 cmd += " -b" | 
|  | 151                 cmd += " -F" | 
|  | 152                 cmd += " batch_%s.fa" % ( str(j).zfill( len( str( len(lInFiles) ) ) ) ) | 
|  | 153                 cmd += " batch_%s.fa" % ( str(i).zfill( len( str( len(lInFiles) ) ) ) ) | 
|  | 154                 cmd += " > mummer_%i_vs_%i.txt" % ( i, j ) | 
|  | 155                 if self._verbose < 3: | 
|  | 156                     cmd += " 2> /dev/null" | 
|  | 157                 returnStatus = os.system( cmd ) | 
|  | 158                 if returnStatus != 0: | 
|  | 159                     msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus ) | 
|  | 160                     sys.stderr.write( "%s\n" % ( msg ) ) | 
|  | 161                     sys.exit(1) | 
|  | 162 | 
|  | 163     def parseMummerOutput( self, mummerFileName, nameSeq1, nameSeq2 ): | 
|  | 164         """ | 
|  | 165         Parse the output from MUmmer. | 
|  | 166         @param mummerFileName: file with the output from MUmmer | 
|  | 167         @type mummerFileName: string | 
|  | 168         @param nameSeq1: name of the first sequence in the pairwise comparison | 
|  | 169         @type nameSeq1: string | 
|  | 170         @param nameSeq2: name of the first sequence in the pairwise comparison | 
|  | 171         @type nameSeq2: string | 
|  | 172         @return: dictionary whose keys are strands and values the number of maximal matches | 
|  | 173         """ | 
|  | 174         if self._verbose > 1: | 
|  | 175             print "parse '%s'" % ( mummerFileName ) | 
|  | 176             sys.stdout.flush() | 
|  | 177         dStrand2NbMatches = {} | 
|  | 178         mummerF = open( mummerFileName, "r" ) | 
|  | 179         strand = "direct" | 
|  | 180         nbMatches = 0 | 
|  | 181         line = mummerF.readline() | 
|  | 182         while True: | 
|  | 183             if line == "": | 
|  | 184                 break | 
|  | 185             if line[0] == ">": | 
|  | 186                 if nameSeq1 not in line: | 
|  | 187                     print "ERROR" | 
|  | 188                     print nameSeq1 | 
|  | 189                     print nameSeq2 | 
|  | 190                     sys.exit(1) | 
|  | 191                 if "Reverse" in line: | 
|  | 192                     dStrand2NbMatches[ strand ] = nbMatches | 
|  | 193                     strand = "reverse" | 
|  | 194                     nbMatches = 0 | 
|  | 195             else: | 
|  | 196                 nbMatches += 1 | 
|  | 197             line = mummerF.readline() | 
|  | 198         dStrand2NbMatches[ strand ] = nbMatches | 
|  | 199         mummerF.close() | 
|  | 200         if self._verbose > 1: | 
|  | 201             print " direct: %i maximal matches" % ( dStrand2NbMatches["direct"] ) | 
|  | 202             print " reverse: %i maximal matches" % ( dStrand2NbMatches["reverse"] ) | 
|  | 203             sys.stdout.flush() | 
|  | 204         return dStrand2NbMatches | 
|  | 205 | 
|  | 206     def getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( self, lInHeaders, nbInSeq ): | 
|  | 207         """ | 
|  | 208         For each pairwise comparison, retrieve the number of matches on both strand. | 
|  | 209         @param lInHeaders: list of the sequence headers | 
|  | 210         @type lInHeaders: list of strings | 
|  | 211         @param nbInSeq: number of input sequences | 
|  | 212         @type nbInSeq: integer | 
|  | 213         @return: dictionary whose keys are pairwise comparisons and values are number of matches on both strands | 
|  | 214         """ | 
|  | 215         dMatrix = {} | 
|  | 216         for i in range( 1, nbInSeq+1 ): | 
|  | 217             for j in range( i+1, nbInSeq+1 ): | 
|  | 218                 pairComp = "%i_vs_%i" % ( i, j ) | 
|  | 219                 dStrand2NbMatches = self.parseMummerOutput( "mummer_%s.txt" % ( pairComp ), lInHeaders[i-1], lInHeaders[j-1] ) | 
|  | 220                 dMatrix[ pairComp ] = dStrand2NbMatches | 
|  | 221         return dMatrix | 
|  | 222 | 
|  | 223     def showResultsAsOrienter( self, nbInSeq, dMatrix ): | 
|  | 224         """ | 
|  | 225         Not used for the moment but can be useful... | 
|  | 226         """ | 
|  | 227         for i in range( 1, nbInSeq+1 ): | 
|  | 228             for j in range( i+1, nbInSeq+1 ): | 
|  | 229                 pairComp = "%i_vs_%i" % ( i, j ) | 
|  | 230                 string = "aligning %i with %i" % ( i, j ) | 
|  | 231                 string += " %i %i" % ( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] ) | 
|  | 232                 string += " max=%i" % ( max( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] ) ) | 
|  | 233                 if dMatrix[pairComp]["reverse"] > dMatrix[pairComp]["direct"]: | 
|  | 234                     string += " strand=-" | 
|  | 235                     string += " nb=%i" % ( dMatrix[pairComp]["reverse"] ) | 
|  | 236                 else: | 
|  | 237                     string += " strand=+" | 
|  | 238                     string += " nb=%i" % ( dMatrix[pairComp]["direct"] ) | 
|  | 239                 print string; sys.stdout.flush() | 
|  | 240 | 
|  | 241     def getSequencesToReverseFromMatrix( self, dMatrix, lNewHeaders ): | 
|  | 242         """ | 
|  | 243         Retrieve the sequences than need to be re-oriented. | 
|  | 244         @param dMatrix: | 
|  | 245         @type dMatrix: | 
|  | 246         @param lInHeaders: list of the new sequence headers | 
|  | 247         @type lInHeaders: list of strings | 
|  | 248         @return: list of headers corresponding to sequences than need to be re-oriented | 
|  | 249         """ | 
|  | 250         lSeqToOrient = [] | 
|  | 251 | 
|  | 252         for i in range( 1, len(lNewHeaders)+1 ): | 
|  | 253             for j in range( i+1, len(lNewHeaders)+1 ): | 
|  | 254                 comp = "%i_vs_%i" % ( i, j ) | 
|  | 255                 nbDirectMatches = dMatrix[ comp ][ "direct" ] | 
|  | 256                 nbReverseMatches = dMatrix[ comp ][ "reverse" ] | 
|  | 257                 if self._verbose > 1: | 
|  | 258                     print "%s: direct=%i reverse=%i" % ( comp, nbDirectMatches, nbReverseMatches ) | 
|  | 259                 if nbReverseMatches > nbDirectMatches and lNewHeaders[i-1] not in lSeqToOrient: | 
|  | 260                     if lNewHeaders[ j-1 ] not in lSeqToOrient: | 
|  | 261                         if self._verbose > 2: | 
|  | 262                             "need to reverse '%s'" % ( lNewHeaders[j-1] ) | 
|  | 263                         lSeqToOrient.append( lNewHeaders[ j-1 ] ) | 
|  | 264         return lSeqToOrient | 
|  | 265 | 
|  | 266 | 
|  | 267     def getSequencesToReverse( self ): | 
|  | 268         """ | 
|  | 269         Return a list with the headers of the sequences to reverse. | 
|  | 270         """ | 
|  | 271         lSequenceHeadersToReverse = [] | 
|  | 272 | 
|  | 273         if self._prgToOrient == "orienter": | 
|  | 274             exitStatus = self.useOrienter() | 
|  | 275             if exitStatus == 0: | 
|  | 276                 self.end() | 
|  | 277                 sys.exit(0) | 
|  | 278             if exitStatus != 0: | 
|  | 279                 print "\nWARNING: 'orienter' had a problem, switching to 'mummer'" | 
|  | 280                 sys.stdout.flush() | 
|  | 281 | 
|  | 282         lInHeaders = pyRepet.seq.fastaDB.dbHeaders( self._inFileName ) | 
|  | 283         nbInSeq = len( lInHeaders ) | 
|  | 284         if self._verbose > 0: | 
|  | 285             print "nb of input sequences: %i" % ( nbInSeq ) | 
|  | 286             sys.stdout.flush() | 
|  | 287 | 
|  | 288         pyRepet.seq.fastaDB.shortenSeqHeaders( self._inFileName, 1 ) | 
|  | 289         tmpFileName = "%s.shortH" % ( self._inFileName ) | 
|  | 290         lNewHeaders = pyRepet.seq.fastaDB.dbHeaders( tmpFileName ) | 
|  | 291         dNew2Init = pyRepet.seq.fastaDB.retrieveLinksNewInitialHeaders( "%slink" % ( tmpFileName ) ) | 
|  | 292 | 
|  | 293         pyRepet.seq.fastaDB.dbSplit( tmpFileName, nbSeqPerBatch=1, newDir=True ) | 
|  | 294         os.chdir( "batches" ) | 
|  | 295         self.compareInputSequencesWithMummer( nbInSeq ) | 
|  | 296         dMatrix = self.getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( lNewHeaders, nbInSeq ) | 
|  | 297         os.chdir( ".." ) | 
|  | 298 | 
|  | 299         lNewHeadersToReverse = self.getSequencesToReverseFromMatrix( dMatrix, lNewHeaders ) | 
|  | 300         for newH in lNewHeadersToReverse: | 
|  | 301             lSequenceHeadersToReverse.append( dNew2Init[ newH ] ) | 
|  | 302         if self._verbose > 0: | 
|  | 303             print "nb of sequences to reverse: %i" % ( len(lNewHeadersToReverse) ) | 
|  | 304             for initH in lSequenceHeadersToReverse: print " %s" % ( initH ) | 
|  | 305             sys.stdout.flush() | 
|  | 306 | 
|  | 307         if self._clean: | 
|  | 308             os.remove( tmpFileName ) | 
|  | 309             os.remove( "%slink" % ( tmpFileName ) ) | 
|  | 310 | 
|  | 311         return lSequenceHeadersToReverse | 
|  | 312 | 
|  | 313     def orientInputSequences( self, lSequenceHeadersToReverse, tmpFileName="" ): | 
|  | 314         """ | 
|  | 315         Save input sequences while re-orienting those needing it. | 
|  | 316         @param lSequenceHeadersToReverse: list of headers corresponding to sequences than need to be re-oriented | 
|  | 317         @type lSequenceHeadersToReverse: list of strings | 
|  | 318         @param tmpFileName: name of a fasta file (inFileName by default) | 
|  | 319         @type tmpFileName: string | 
|  | 320         """ | 
|  | 321         if self._verbose > 0: | 
|  | 322             print "saving oriented sequences..." | 
|  | 323             sys.stdout.flush() | 
|  | 324         if tmpFileName == "": | 
|  | 325             tmpFileName = self._inFileName | 
|  | 326         inDB = BioseqDB( tmpFileName ) | 
|  | 327         outDB = BioseqDB() | 
|  | 328         for bs in inDB.db: | 
|  | 329             if bs.header in lSequenceHeadersToReverse: | 
|  | 330                 bs.reverseComplement() | 
|  | 331                 bs.header += " re-oriented" | 
|  | 332             outDB.add( bs ) | 
|  | 333         outDB.save( self._outFileName ) | 
|  | 334 | 
|  | 335     def clean( self ): | 
|  | 336         if os.path.exists( "batches" ): | 
|  | 337             os.system( "rm -rf batches" ) | 
|  | 338         if os.path.exists( "orienter_error.log" ): | 
|  | 339             os.remove( "orienter_error.log" ) | 
|  | 340         for f in glob.glob( "core.*" ): | 
|  | 341             os.remove( f ) | 
|  | 342 | 
|  | 343     def start( self ): | 
|  | 344         """ | 
|  | 345         Useful commands before running the program. | 
|  | 346         """ | 
|  | 347         self.checkAttributes() | 
|  | 348         if self._verbose > 0: | 
|  | 349             print "START %s" % ( type(self).__name__ ) | 
|  | 350             print "input file: %s" % ( self._inFileName ) | 
|  | 351             sys.stdout.flush() | 
|  | 352 | 
|  | 353     def end( self ): | 
|  | 354         """ | 
|  | 355         Useful commands before ending the program. | 
|  | 356         """ | 
|  | 357         if self._clean: | 
|  | 358             self.clean() | 
|  | 359         if self._verbose > 0: | 
|  | 360             print "END %s" % ( type(self).__name__ ) | 
|  | 361             sys.stdout.flush() | 
|  | 362 | 
|  | 363     def run( self ): | 
|  | 364         """ | 
|  | 365         Run the program. | 
|  | 366         """ | 
|  | 367         self.start() | 
|  | 368         lSequenceHeadersToReverse = self.getSequencesToReverse() | 
|  | 369         self.orientInputSequences( lSequenceHeadersToReverse ) | 
|  | 370         self.end() | 
|  | 371 | 
|  | 372 if __name__ == "__main__": | 
|  | 373     i = OrientSequences() | 
|  | 374     i.setAttributesFromCmdLine() | 
|  | 375     i.run() |