s_mart: commons/tools/refalign2fasta.py comparison

comparison commons/tools/refalign2fasta.py @ 18:94ab73e8a190

Uploaded

author	m-zytnicki
date	Mon, 29 Apr 2013 03:20:15 -0400
parents
children

comparison

equal deleted inserted replaced

-:b0e8584489e6
+:94ab73e8a190
+#!/usr/bin/env python
+##@file
+# Convert the output from Refalign (MSA program) into the 'fasta' format.
+# Usually used before subsequent analysis such as the estimation of deletion rate.
+#
+# usage: refalign2fasta.py [ options ]
+# options:
+#      -h: this help
+#      -i: name of the input file (output from refalign)
+#      -r: name of the reference sequence (discard if not provided)
+#      -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')
+#      -o: name of the output file (default=inFileName'.fa_aln',format='fasta')
+import os
+import sys
+import getopt
+import exceptions
+if not os.environ.has_key( "REPET_PATH" ):
+print "ERROR: no environment variable REPET_PATH"
+sys.exit(1)
+sys.path.append( os.environ["REPET_PATH"] )
+import pyRepet.seq.Bioseq
+def help():
+"""
+Give the list of the command-line options.
+"""
+print
+print "usage:",sys.argv[0]," [ options ]"
+print "options:"
+print "     -h: this help"
+print "     -i: name of the input file (output from refalign)"
+print "     -r: name of the reference sequence (discard if not provided)"
+print "     -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')"
+print "     -o: name of the output file (default=inFileName'.fa_aln',format='fasta')"
+print
+def getAlignments( inFileName ):
+"""
+Retrieve the alignments from the input file.
+@param inFileName: name of the input file
+@type: string
+@return: list of alignments ( refseq, seq, header of seq )
+@rtype: list of 3d-tuples
+"""
+lAlign = []
+inFile = open( inFileName, "r" )
+line = inFile.readline()
+while True:
+if line == "":
+break
+refseq, seq, label = line[:-1].split("\t")[:3]
+lAlign.append( ( refseq, seq, label ) )
+line = inFile.readline()
+inFile.close()
+return lAlign
+def getGaps( seq ):
+"""
+Get the gaps on a sequence, start coordinate and length. The start
+coordinate of a gap is the # of the nucleotide after which it starts.
+@param seq: sequence to analyse
+@type seq: string
+@return: list of gaps ( start coordinate, length )
+@rtype: list of 2d-tuples
+"""
+prev = "N"
+lGapsOnSeq = []
+i = 0
+lengthGap = 0
+for c in seq:
+if c == "-" and prev != "-":
+startGap = i
+if c != "-" and prev == "-":
+lGapsOnSeq.append( ( startGap, lengthGap ) )
+lengthGap = 0
+if c != "-":
+i += 1
+else:
+lengthGap += 1
+prev = c
+# case with a gap at the end of the sequence
+if seq[ len(seq) - 1 ] == "-":
+lGapsOnSeq.append( ( startGap, lengthGap ) )
+return lGapsOnSeq
+def getGapsOnRefSeq( lAlign ):
+"""
+Retrieve the gaps on the ref seq in all the alignments.
+@param lAlign: list of alignments ( refseq, seq, header of seq )
+@type lAlign: list of 3d-tuples
+@return: list of gaps per alignment
+@rtype: list of lists of 2d-tuples
+"""
+lGapsOnRef = []
+for align in lAlign:
+refseq = align[0]
+lGapsOnRef.append( getGaps( refseq ) )
+return lGapsOnRef
+def insertGap( seq, startGap, lengthGap ):
+"""
+Get a new seq by inserting a gap in the give seq.
+@param seq: sequence
+@type seq: string
+@param startGap:
+@type: startGap: integer
+@param lengthGap: length of the gap
+@type lengthGap: integer
+@return: new seq made from the give seq by inserting the gap
+@rtype: string
+"""
+new_seq = seq[:startGap] + (lengthGap*'-') + seq[startGap:]
+return new_seq
+def insertListGaps( inSeq, lGaps ):
+"""
+Insert all the gaps from the list into the sequence.
+@param inSeq: sequence
+@type inSeq: string
+@param lGaps: list of gaps ( start coordinate, length )
+@type: list of 2d-tuples
+@return: sequence with the gaps
+@rtype: string
+"""
+# insert gaps from the right to the left
+lGaps.sort()
+lGaps.reverse()
+prevPos = 0
+outSeq = inSeq
+for startGap, lengthGap in lGaps:
+if startGap != prevPos:
+outSeq = insertGap( outSeq, startGap, lengthGap )
+prevPos = startGap
+return outSeq
+def insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName ):
+"""
+Get the ref seq with all its gaps inserted.
+@param lAlign: list of alignments ( refseq, seq, header of seq )
+@type lAlign: list of 3d-tuples
+@param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
+@type lGapsOnRefSeqPerAlign: list of list of 2d-tuples
+@param refseqName: name of the reference sequence
+@type refseqName: string
+@return: ref seq with all its gaps inserted
+@rtype: string
+"""
+# retrieve the initial ref seq, ie without any gap
+initRefSeq = lAlign[0][0].replace("-","")
+# convert the list of lists of gaps into a list of gaps
+flat_lGaps = []
+for gaps in lGapsOnRefSeqPerAlign:
+flat_lGaps.extend( gaps )
+# insert the gaps in the sequence of ref seq
+finalRefSeq = insertListGaps( initRefSeq, flat_lGaps )
+Align_refseq = ( refseqName, finalRefSeq )
+return Align_refseq
+def insertgap_seq( lAlign, lGapsOnRefSeqPerAlign ):
+"""
+Insert in the sequences all the gaps found on the ref seq.
+@param lAlign: list of alignments ( refseq, seq, header of seq )
+@type lAlign: list of 3d-tuples
+@param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
+@type lGapsOnRefSeqPerAlign: list of list of 2d-tuples
+@return: list of lists (sequence with gaps, header)
+@rtype: list of 2d-tuples
+insert dans les seq les gaps donnes par la liste de liste de gap
+retourne une liste des nouvelles seq apres insertion des gaps
+"""
+# for each gap, add the nb of the alignment in which it has been found
+newlistgap_seq =[]
+alignID = 0
+for lGaps in lGapsOnRefSeqPerAlign:
+for startGap,lengthGap in lGaps:
+newlistgap_seq.append( ( startGap, lengthGap, alignID ) )
+alignID += 1
+newlistgap_seq.sort()
+Align_seq = []
+# for each alignment
+for j in xrange(0,len(lAlign)):
+#create a new list = list of gaps to be inserted in a given seq
+newlist = []
+offset = 0
+longmax = 0
+longself = 0
+posprec = 0
+for startGap, lengthGap, alignID in newlistgap_seq:
+# when 2 gaps have the same start, we keep the longer one
+if startGap != posprec:
+if longmax > longself:
+newlist.append( (posprec + offset, longmax - longself) )
+offset += longself
+longmax=0
+longself=0
+#lorsque le numero de la seq du tuple a la meme valeur que j
+#on stocke la valeur de lengthGap
+if j == alignID:
+longself = lengthGap
+#sinon on prend comme valeur de longmax la valeur maximale de longmax et lengthGap
+else:
+longmax = max(longmax, lengthGap)
+posprec = startGap
+if longmax > longself:
+newlist.append((posprec + offset, longmax - longself))
+newSeq = insertListGaps( (lAlign[j][1]), newlist )
+header = lAlign[j][2]
+Align_seq.append( ( header, newSeq ) )
+return Align_seq
+def getSeqWithDeletions( lAlign ):
+"""
+Get the sequences by putting gaps only when they correspond to a deletion compare to ref seq.
+Used for instance when we want to estimate the deletion rate.
+@param lAlign: list of alignments ( refseq, seq, header of seq )
+@type lAlign: list of 3d-tuples
+@return: list of lists ( header, sequence with gaps )
+@rtype: list of 2d-tuples
+"""
+Align_seq = []
+for align in lAlign:
+refseq = align[0]
+seq = align[1]
+header = align[2]
+newSeq = ""
+for i in xrange(0,len(refseq)):
+if refseq[i] != "-":
+newSeq += seq[i]
+Align_seq.append( ( header, newSeq ) )
+return Align_seq
+def saveMSA( outFileName, Align_seq, Align_seqref=None ):
+"""
+Save the results as a multiple sequence alignment (MSA) in the 'fasta' format.
+@param outFileName: name of the output file
+@type outFileName: string
+@param Align_seqref: sequence of ref seq
+@type Align_seqref: string
+"""
+outFile = open( outFileName, "w" )
+bs = pyRepet.seq.Bioseq.Bioseq()
+# if provided, save the ref seq
+if Align_seqref != None:
+bs.header = Align_seqref[0]
+bs.sequence = Align_seqref[1]
+bs.write( outFile )
+# save the other sequences
+for i in Align_seq:
+bs.header = i[0]
+bs.sequence = i[1]
+bs.write( outFile )
+outFile.close()
+def saveOnlyWithDeletions( lAlign, refseqName, outFileName ):
+Align_seq = getSeqWithDeletions( lAlign )
+if refseqName != "":
+Align_seqref = ( refseqName, lAlign[0][0].replace("-","") )
+saveMSA( outFileName, Align_seq, Align_seqref )
+else:
+saveMSA( outFileName, Align_seq )
+def main():
+inFileName = ""
+refseqName = ""
+keepGap = "id"
+outFileName = ""
+global verbose
+verbose = 0
+try:
+opts, args = getopt.getopt(sys.argv[1:],"hi:r:g:o:v:")
+except getopt.GetoptError, err:
+print str(err)
+help()
+sys.exit(1)
+for o,a in opts:
+if o == "-h":
+help()
+sys.exit(0)
+elif o == "-i":
+inFileName = a
+elif o == "-r":
+refseqName = a
+elif o == "-g":
+keepGap = a
+elif o == "-o":
+outFileName = a
+elif o == "-v":
+verbose = int(a)
+if inFileName == "":
+print "ERROR: missing input file name"
+help()
+sys.exit(1)
+if verbose > 0:
+print "START %s" % (sys.argv[0].split("/")[-1])
+sys.stdout.flush()
+lAlign = getAlignments( inFileName )
+if verbose > 0:
+print "nb of alignments: %i" % ( len(lAlign) )
+sys.stdout.flush()
+if outFileName == "":
+outFileName = "%s.fa_aln" % ( inFileName )
+if verbose > 0:
+print "output file: '%s'" % ( outFileName )
+if keepGap == "id":
+lGapsOnRefSeqPerAlign = getGapsOnRefSeq( lAlign )
+Align_seq = insertgap_seq( lAlign, lGapsOnRefSeqPerAlign )
+if refseqName != "":
+Align_seqref = insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName )
+saveMSA( outFileName, Align_seq, Align_seqref )
+else:
+saveMSA( outFileName, Align_seq )
+elif keepGap == "d":
+saveOnlyWithDeletions( lAlign, refseqName, outFileName )
+elif keepGap == "i":
+print "ERROR: '-g i' not yet available"
+sys.exit(1)
+if verbose > 0:
+print "END %s" % (sys.argv[0].split("/")[-1])
+sys.stdout.flush()
+return 0
+if __name__ == "__main__" :
+main ()

Mercurial > repos > yufei-luo > s_mart

comparison commons/tools/refalign2fasta.py @ 18:94ab73e8a190