view getIndels.py @ 1:f1dc6f5fce6e default tip

Corrected version string.
author devteam <devteam@galaxyproject.org>
date Thu, 10 Apr 2014 13:47:30 -0400
parents 91655316fcf0
children
line wrap: on
line source

#!/usr/bin/env python

"""
Estimate INDELs for pair-wise alignments.

usage: %prog maf_input out_file1 out_file2
"""

from __future__ import division
import sys
from bx.cookbook import doc_optparse
import bx.align.maf

assert sys.version_info[:2] >= ( 2, 4 )

def main():
    # Parsing Command Line here
    options, args = doc_optparse.parse( __doc__ )
    
    try:
        inp_file, out_file1 = args
    except:
        print >> sys.stderr, "Tool initialization error."
        sys.exit()
    
    try:
        open(inp_file, 'r')
    except:
        print >> sys.stderr, "Unable to open input file"
        sys.exit()
    try:
        fout1 = open(out_file1, 'w')
        #fout2 = open(out_file2, 'w')
    except:
        print >> sys.stderr, "Unable to open output file"
        sys.exit()

    try:
        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
    except:
        print >> sys.stderr, "Your MAF file appears to be malformed."
        sys.exit()
    
    print >> fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length"
    for block_ind, block in enumerate(maf_reader):
        if len(block.components) < 2:
            continue
        seq1 = block.components[0].text
        src1 = block.components[0].src
        start1 = block.components[0].start
        if len(block.components) == 2:
            seq2 = block.components[1].text
            src2 = block.components[1].src
            start2 = block.components[1].start
            #for pos in range(len(seq1)):
            nt_pos1 = start1-1    #position of the nucleotide (without counting gaps)
            nt_pos2 = start2-1
            pos = 0        #character column position
            gaplen1 = 0
            gaplen2 = 0
            prev_pos_gap1 = 0
            prev_pos_gap2 = 0
            while pos < len(seq1):
                if prev_pos_gap1 == 0:
                    gaplen1 = 0
                if prev_pos_gap2 == 0:
                    gaplen2 = 0
                    
                if seq1[pos] == '-':
                    if seq2[pos] != '-':
                        nt_pos2 += 1
                        gaplen1 += 1
                        prev_pos_gap1 = 1
                        #write 2
                        if prev_pos_gap2 == 1:
                            prev_pos_gap2 = 0
                            print >> fout1, "%d\t%s\t%s\t%s\t%s\t%s\t%s" % ( block_ind+1, src2, nt_pos1, nt_pos1+1, nt_pos2-1, nt_pos2-1+gaplen2, gaplen2 )
                        if pos == len(seq1)-1:
                            print >> fout1, "%d\t%s\t%s\t%s\t%s\t%s\t%s" % ( block_ind+1, src1, nt_pos1, nt_pos1+1, nt_pos2+1-gaplen1, nt_pos2+1, gaplen1 )
                    else:
                        prev_pos_gap1 = 0
                        prev_pos_gap2 = 0
                        """
                        if prev_pos_gap1 == 1:
                            prev_pos_gap1 = 0
                            print >> fout1, "%d\t%s\t%s\t%s\t%s" % ( block_ind+1, src1, nt_pos1-1, nt_pos1, gaplen1 )
                        elif prev_pos_gap2 == 1:
                            prev_pos_gap2 = 0
                            print >> fout1, "%d\t%s\t%s\t%s\t%s" % ( block_ind+1, src2, nt_pos2-1, nt_pos2, gaplen2 )
                        """
                else:
                    nt_pos1 += 1
                    if seq2[pos] != '-':
                        nt_pos2 += 1
                        #write both
                        if prev_pos_gap1 == 1:
                            prev_pos_gap1 = 0
                            print >> fout1, "%d\t%s\t%s\t%s\t%s\t%s\t%s" % ( block_ind+1, src1, nt_pos1-1, nt_pos1, nt_pos2-gaplen1, nt_pos2, gaplen1 )
                        elif prev_pos_gap2 == 1:
                            prev_pos_gap2 = 0
                            print >> fout1, "%d\t%s\t%s\t%s\t%s\t%s\t%s" % ( block_ind+1, src2, nt_pos1-gaplen2, nt_pos1, nt_pos2-1, nt_pos2, gaplen2 )
                    else:
                        gaplen2 += 1
                        prev_pos_gap2 = 1
                        #write 1
                        if prev_pos_gap1 == 1:
                            prev_pos_gap1 = 0
                            print >> fout1, "%d\t%s\t%s\t%s\t%s\t%s\t%s" % ( block_ind+1, src1, nt_pos1-1, nt_pos1, nt_pos2, nt_pos2+gaplen1, gaplen1 )
                        if pos == len(seq1)-1:
                            print >> fout1, "%d\t%s\t%s\t%s\t%s\t%s\t%s" % ( block_ind+1, src2, nt_pos1+1-gaplen2, nt_pos1+1, nt_pos2, nt_pos2+1, gaplen2 )
                pos += 1


if __name__ == "__main__":
    main()