Mercurial > repos > yufei-luo > s_mart
diff smart_toolShed/commons/core/sql/TableSeqAdaptator.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/commons/core/sql/TableSeqAdaptator.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,185 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import sys +from commons.core.sql.TableAdaptator import TableAdaptator +from commons.core.sql.ITableSeqAdaptator import ITableSeqAdaptator +from commons.core.coord.SetUtils import SetUtils +from commons.core.seq.Bioseq import Bioseq + + +## Adaptator for a Seq table +# +class TableSeqAdaptator( TableAdaptator, ITableSeqAdaptator ): + + ## Retrieve all the distinct accession names in a list. + # + # @return lAccessions list of accessions + # + def getAccessionsList( self ): + sqlCmd = "SELECT DISTINCT accession FROM %s;" % ( self._table ) + lAccessions = self._getStringListWithSQLCmd(sqlCmd) + return lAccessions + + ## Save sequences in a fasta file from a list of accession names. + # + # @param lAccessions list of accessions + # @param outFileName string Fasta file + # + def saveAccessionsListInFastaFile( self, lAccessions, outFileName ): + outFile = open( outFileName, "w" ) + for ac in lAccessions: + bs = self.getBioseqFromHeader( ac ) + bs.write(outFile) + outFile.close() + + ## Get a bioseq instance given its header + # + # @param header string name of the sequence ('accession' field in the 'seq' table) + # @return bioseq instance + # + def getBioseqFromHeader( self, header ): + sqlCmd = "SELECT * FROM %s WHERE accession='%s';" % ( self._table, header ) + self._iDb.execute( sqlCmd ) + res = self._iDb.fetchall() + return Bioseq( res[0][0], res[0][1] ) + + ## Retrieve the length of a sequence given its name. + # + # @param accession name of the sequence + # @return seqLength integer length of the sequence + # + def getSeqLengthFromAccession( self, accession ): + sqlCmd = 'SELECT length FROM %s WHERE accession="%s"' % ( self._table, accession ) + seqLength = self._iDb.getIntegerWithSQLCmd(sqlCmd) + return seqLength + + ## Retrieve the length of a sequence given its description. + # + # @param description of the sequence + # @return seqLength integer length of the sequence + # + def getSeqLengthFromDescription( self, description ): + sqlCmd = 'SELECT length FROM %s WHERE description="%s"' % ( self._table, description ) + seqLength = self._iDb.getIntegerWithSQLCmd(sqlCmd) + return seqLength + + ## Retrieve all the accessions with length in a list of tuples + # + # @return lAccessionLengthTuples list of tuples + # + def getAccessionAndLengthList(self): + sqlCmd = 'SELECT accession, length FROM %s' % self._table + self._iDb.execute(sqlCmd) + res = self._iDb.fetchall() + lAccessionLengthTuples = [] + for i in res: + lAccessionLengthTuples.append(i) + return lAccessionLengthTuples + + ## get subsequence according to given parameters + # + # @param accession + # @param start integer + # @param end integer + # @return bioseq.sequence string + # + def getSubSequence( self, accession, start, end ): + bs = Bioseq() + if start <= 0 or end <= 0: + print "ERROR with coordinates start=%i or end=%i" % ( start, end ) + sys.exit(1) + + if accession not in self.getAccessionsList(): + print "ERROR: accession '%s' absent from table '%s'" % ( accession, self._table ) + sys.exit(1) + + lengthAccession = self.getSeqLengthFromAccession( accession ) + if start > lengthAccession or end > lengthAccession: + print "ERROR: coordinates start=%i end=%i out of sequence '%s' range (%i bp)" % ( start, end, accession, lengthAccession ) + sys.exit(1) + + sqlCmd = "SELECT SUBSTRING(sequence,%i,%i) FROM %s WHERE accession='%s'" % ( min(start,end), abs(end-start)+ 1, self._table, accession ) + self._iDb.execute( sqlCmd ) + res = self._iDb.fetchall() + bs.setSequence( res[0][0] ) + if start > end: + bs.reverseComplement() + return bs.sequence + + ## get bioseq from given set list + # + # @param lSets set list of sets + # @return bioseq instance + # + def getBioseqFromSetList( self, lSets ): + header = "%s::%i %s " % ( lSets[0].name, lSets[0].id, lSets[0].seqname ) + sequence = "" + lSortedSets = SetUtils.getSetListSortedByIncreasingMinThenMax( lSets ) + if not lSets[0].isOnDirectStrand(): + lSortedSets.reverse() + for iSet in lSortedSets: + header += "%i..%i," % ( iSet.getStart(), iSet.getEnd() ) + sequence += self.getSubSequence( iSet.seqname, iSet.getStart(), iSet.getEnd() ) + return Bioseq( header[:-1], sequence ) + + ## Return True if the given accession is present in the table + # + def isAccessionInTable( self, name ): + sqlCmd = "SELECT accession FROM %s WHERE accession='%s'" % ( self._table, name ) + self._iDb.execute( sqlCmd ) + res = self._iDb.fetchall() + return bool(res) + + ## Retrieve all the distinct accession names in a fasta file. + # + # @param outFileName string Fasta file + # + def exportInFastaFile(self, outFileName ): + lAccessions = self.getAccessionsList() + self.saveAccessionsListInFastaFile( lAccessions, outFileName ) + + def _getStringListWithSQLCmd( self, sqlCmd ): + self._iDb.execute(sqlCmd) + res = self._iDb.fetchall() + lString = [] + for i in res: + lString.append(i[0]) + return lString + + def _getTypeAndAttr2Insert(self, bs): + type2Insert = ( "'%s'", "'%s'", "'%s'", "'%i'" ) + attr2Insert = (bs.header.split()[0], bs.sequence, bs.header, bs.getLength()) + return type2Insert, attr2Insert + + def _escapeAntislash(self, obj): + pass +