view commons/tools/refalign2fasta.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

##@file
# Convert the output from Refalign (MSA program) into the 'fasta' format.
# Usually used before subsequent analysis such as the estimation of deletion rate.
#
# usage: refalign2fasta.py [ options ]
# options:
#      -h: this help
#      -i: name of the input file (output from refalign)
#      -r: name of the reference sequence (discard if not provided)
#      -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')
#      -o: name of the output file (default=inFileName'.fa_aln',format='fasta')

import os
import sys
import getopt
import exceptions

if not os.environ.has_key( "REPET_PATH" ):
    print "ERROR: no environment variable REPET_PATH"
    sys.exit(1)
sys.path.append( os.environ["REPET_PATH"] )

import pyRepet.seq.Bioseq


def help():
    """
    Give the list of the command-line options.
    """
    print
    print "usage:",sys.argv[0]," [ options ]"
    print "options:"
    print "     -h: this help"
    print "     -i: name of the input file (output from refalign)"
    print "     -r: name of the reference sequence (discard if not provided)"
    print "     -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')"
    print "     -o: name of the output file (default=inFileName'.fa_aln',format='fasta')"
    print


def getAlignments( inFileName ):
    """
    Retrieve the alignments from the input file.

    @param inFileName: name of the input file
    @type: string

    @return: list of alignments ( refseq, seq, header of seq )
    @rtype: list of 3d-tuples
    """

    lAlign = []

    inFile = open( inFileName, "r" )
    line = inFile.readline()
    while True:
        if line == "":
            break
        refseq, seq, label = line[:-1].split("\t")[:3]
        lAlign.append( ( refseq, seq, label ) )
        line = inFile.readline()
    inFile.close()

    return lAlign


def getGaps( seq ):
    """
    Get the gaps on a sequence, start coordinate and length. The start
    coordinate of a gap is the # of the nucleotide after which it starts.

    @param seq: sequence to analyse
    @type seq: string

    @return: list of gaps ( start coordinate, length )
    @rtype: list of 2d-tuples
    """

    prev = "N"
    lGapsOnSeq = []
    i = 0
    lengthGap = 0
    for c in seq:
        if c == "-" and prev != "-":
            startGap = i
        if c != "-" and prev == "-":
            lGapsOnSeq.append( ( startGap, lengthGap ) )
            lengthGap = 0
        if c != "-":
            i += 1
        else:
            lengthGap += 1
        prev = c

     # case with a gap at the end of the sequence
    if seq[ len(seq) - 1 ] == "-":
        lGapsOnSeq.append( ( startGap, lengthGap ) )

    return lGapsOnSeq


def getGapsOnRefSeq( lAlign ):
    """
    Retrieve the gaps on the ref seq in all the alignments.

    @param lAlign: list of alignments ( refseq, seq, header of seq )
    @type lAlign: list of 3d-tuples

    @return: list of gaps per alignment
    @rtype: list of lists of 2d-tuples
    """

    lGapsOnRef = []

    for align in lAlign:
        refseq = align[0]
        lGapsOnRef.append( getGaps( refseq ) )

    return lGapsOnRef


def insertGap( seq, startGap, lengthGap ):
    """
    Get a new seq by inserting a gap in the give seq.

    @param seq: sequence
    @type seq: string

    @param startGap:
    @type: startGap: integer

    @param lengthGap: length of the gap
    @type lengthGap: integer

    @return: new seq made from the give seq by inserting the gap
    @rtype: string
    """

    new_seq = seq[:startGap] + (lengthGap*'-') + seq[startGap:] 
    return new_seq


def insertListGaps( inSeq, lGaps ):
    """
    Insert all the gaps from the list into the sequence.

    @param inSeq: sequence
    @type inSeq: string

    @param lGaps: list of gaps ( start coordinate, length )
    @type: list of 2d-tuples

    @return: sequence with the gaps
    @rtype: string
    """

    # insert gaps from the right to the left
    lGaps.sort()
    lGaps.reverse()

    prevPos = 0
    outSeq = inSeq

    for startGap, lengthGap in lGaps:
        if startGap != prevPos:
            outSeq = insertGap( outSeq, startGap, lengthGap )
            prevPos = startGap

    return outSeq


def insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName ):
    """
    Get the ref seq with all its gaps inserted.

    @param lAlign: list of alignments ( refseq, seq, header of seq )
    @type lAlign: list of 3d-tuples

    @param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
    @type lGapsOnRefSeqPerAlign: list of list of 2d-tuples

    @param refseqName: name of the reference sequence
    @type refseqName: string

    @return: ref seq with all its gaps inserted
    @rtype: string
    """

    # retrieve the initial ref seq, ie without any gap
    initRefSeq = lAlign[0][0].replace("-","")

    # convert the list of lists of gaps into a list of gaps
    flat_lGaps = []
    for gaps in lGapsOnRefSeqPerAlign:
        flat_lGaps.extend( gaps )

    # insert the gaps in the sequence of ref seq
    finalRefSeq = insertListGaps( initRefSeq, flat_lGaps )

    Align_refseq = ( refseqName, finalRefSeq )

    return Align_refseq


def insertgap_seq( lAlign, lGapsOnRefSeqPerAlign ):
    """
    Insert in the sequences all the gaps found on the ref seq.

    @param lAlign: list of alignments ( refseq, seq, header of seq )
    @type lAlign: list of 3d-tuples

    @param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
    @type lGapsOnRefSeqPerAlign: list of list of 2d-tuples

    @return: list of lists (sequence with gaps, header)
    @rtype: list of 2d-tuples

    insert dans les seq les gaps donnes par la liste de liste de gap
    retourne une liste des nouvelles seq apres insertion des gaps
    """

    # for each gap, add the nb of the alignment in which it has been found
    newlistgap_seq =[]
    alignID = 0
    for lGaps in lGapsOnRefSeqPerAlign:
        for startGap,lengthGap in lGaps:
            newlistgap_seq.append( ( startGap, lengthGap, alignID ) )
        alignID += 1
    newlistgap_seq.sort()

    Align_seq = []

    # for each alignment
    for j in xrange(0,len(lAlign)):

        #create a new list = list of gaps to be inserted in a given seq
        newlist = []
        offset = 0
        longmax = 0
        longself = 0
        posprec = 0
        for startGap, lengthGap, alignID in newlistgap_seq:
            # when 2 gaps have the same start, we keep the longer one
            if startGap != posprec:
                if longmax > longself:
                    newlist.append( (posprec + offset, longmax - longself) )
                offset += longself
                longmax=0
                longself=0
            #lorsque le numero de la seq du tuple a la meme valeur que j
            #on stocke la valeur de lengthGap
            if j == alignID:
                longself = lengthGap
            #sinon on prend comme valeur de longmax la valeur maximale de longmax et lengthGap
            else:
                longmax = max(longmax, lengthGap)
            posprec = startGap
        if longmax > longself:
            newlist.append((posprec + offset, longmax - longself))

        newSeq = insertListGaps( (lAlign[j][1]), newlist )
        header = lAlign[j][2]
        Align_seq.append( ( header, newSeq ) )

    return Align_seq


def getSeqWithDeletions( lAlign ):
    """
    Get the sequences by putting gaps only when they correspond to a deletion compare to ref seq.
    Used for instance when we want to estimate the deletion rate.

    @param lAlign: list of alignments ( refseq, seq, header of seq )
    @type lAlign: list of 3d-tuples

    @return: list of lists ( header, sequence with gaps )
    @rtype: list of 2d-tuples
    """

    Align_seq = []

    for align in lAlign:
        refseq = align[0]
        seq = align[1]
        header = align[2]
        newSeq = ""
        for i in xrange(0,len(refseq)):
            if refseq[i] != "-":
                newSeq += seq[i]
        Align_seq.append( ( header, newSeq ) )

    return Align_seq


def saveMSA( outFileName, Align_seq, Align_seqref=None ):
    """
    Save the results as a multiple sequence alignment (MSA) in the 'fasta' format.

    @param outFileName: name of the output file
    @type outFileName: string

    @param Align_seqref: sequence of ref seq
    @type Align_seqref: string
    """

    outFile = open( outFileName, "w" )
    bs = pyRepet.seq.Bioseq.Bioseq()

    # if provided, save the ref seq
    if Align_seqref != None:
        bs.header = Align_seqref[0]
        bs.sequence = Align_seqref[1]
        bs.write( outFile )

    # save the other sequences
    for i in Align_seq:
        bs.header = i[0]
        bs.sequence = i[1]
        bs.write( outFile )

    outFile.close()
    
    
def saveOnlyWithDeletions( lAlign, refseqName, outFileName ):
    Align_seq = getSeqWithDeletions( lAlign )
    if refseqName != "":
        Align_seqref = ( refseqName, lAlign[0][0].replace("-","") )
        saveMSA( outFileName, Align_seq, Align_seqref )
    else:
        saveMSA( outFileName, Align_seq )
        
        
def main():
    
    inFileName = ""
    refseqName = ""
    keepGap = "id"
    outFileName = ""
    global verbose
    verbose = 0
    
    try:
        opts, args = getopt.getopt(sys.argv[1:],"hi:r:g:o:v:")
    except getopt.GetoptError, err:
        print str(err)
        help()
        sys.exit(1)
    for o,a in opts:
        if o == "-h":
            help()
            sys.exit(0)
        elif o == "-i":
            inFileName = a
        elif o == "-r":
            refseqName = a
        elif o == "-g":
            keepGap = a
        elif o == "-o":
            outFileName = a
        elif o == "-v":
            verbose = int(a)

    if inFileName == "":
        print "ERROR: missing input file name"
        help()
        sys.exit(1)
        
    if verbose > 0:
        print "START %s" % (sys.argv[0].split("/")[-1])
        sys.stdout.flush()
        
    lAlign = getAlignments( inFileName )
    if verbose > 0:
        print "nb of alignments: %i" % ( len(lAlign) )
        sys.stdout.flush()
        
    if outFileName == "":
        outFileName = "%s.fa_aln" % ( inFileName )
    if verbose > 0:
        print "output file: '%s'" % ( outFileName )
        
    if keepGap == "id":
        lGapsOnRefSeqPerAlign = getGapsOnRefSeq( lAlign )
        Align_seq = insertgap_seq( lAlign, lGapsOnRefSeqPerAlign )
        if refseqName != "":
            Align_seqref = insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName )
            saveMSA( outFileName, Align_seq, Align_seqref )
        else:
            saveMSA( outFileName, Align_seq )
            
    elif keepGap == "d":
        saveOnlyWithDeletions( lAlign, refseqName, outFileName )
        
    elif keepGap == "i":
        print "ERROR: '-g i' not yet available"
        sys.exit(1)
        
    if verbose > 0:
        print "END %s" % (sys.argv[0].split("/")[-1])
        sys.stdout.flush()
        
    return 0


if __name__ == "__main__" :
    main ()