view tools/regVariation/getIndels.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python

"""
Estimate INDELs for pair-wise alignments.

usage: %prog maf_input out_file1 out_file2
"""

from __future__ import division
from galaxy import eggs
import pkg_resources 
pkg_resources.require( "bx-python" )
try:
    pkg_resources.require("numpy")
except:
    pass
import psyco_full
import sys
from bx.cookbook import doc_optparse
from galaxy.tools.exception_handling import *
import bx.align.maf

assert sys.version_info[:2] >= ( 2, 4 )

def main():   
    # Parsing Command Line here
    options, args = doc_optparse.parse( __doc__ )
    
    try:
        inp_file, out_file1 = args    
    except:
        print >> sys.stderr, "Tool initialization error."
        sys.exit()
    
    try:
        fin = open(inp_file,'r')
    except:
        print >> sys.stderr, "Unable to open input file"
        sys.exit()
    try:
        fout1 = open(out_file1,'w')
        #fout2 = open(out_file2,'w')
    except:
        print >> sys.stderr, "Unable to open output file"
        sys.exit()

    try:
        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
    except:
        print >>sys.stderr, "Your MAF file appears to be malformed."
        sys.exit()
    maf_count = 0
    
    print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length"
    for block_ind, block in enumerate(maf_reader):
        if len(block.components) < 2:
            continue
        seq1 = block.components[0].text
        src1 = block.components[0].src
        start1 = block.components[0].start
        if len(block.components) == 2:
            seq2 = block.components[1].text
            src2 = block.components[1].src
            start2 = block.components[1].start
            #for pos in range(len(seq1)):
            nt_pos1 = start1-1    #position of the nucleotide (without counting gaps)
            nt_pos2 = start2-1
            pos = 0        #character column position
            gaplen1 = 0
            gaplen2 = 0
            prev_pos_gap1 = 0
            prev_pos_gap2 = 0
            while pos < len(seq1):
                if prev_pos_gap1 == 0:
                    gaplen1 = 0
                if prev_pos_gap2 == 0:
                    gaplen2 = 0
                    
                if seq1[pos] == '-':
                    if seq2[pos] != '-':
                        nt_pos2 += 1
                        gaplen1 += 1
                        prev_pos_gap1 = 1
                        #write 2
                        if prev_pos_gap2 == 1:
                            prev_pos_gap2 = 0
                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2)
                        if pos == len(seq1)-1:
                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1)
                    else:
                        prev_pos_gap1 = 0
                        prev_pos_gap2 = 0
                        """
                        if prev_pos_gap1 == 1:
                            prev_pos_gap1 = 0
                            print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1)
                        elif prev_pos_gap2 == 1:
                            prev_pos_gap2 = 0
                            print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2)
                        """
                else:
                    nt_pos1 += 1
                    if seq2[pos] != '-':
                        nt_pos2 += 1
                        #write both
                        if prev_pos_gap1 == 1:
                            prev_pos_gap1 = 0
                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1)
                        elif prev_pos_gap2 == 1:
                            prev_pos_gap2 = 0
                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2)
                    else:
                        gaplen2 += 1
                        prev_pos_gap2 = 1
                        #write 1
                        if prev_pos_gap1 == 1:
                            prev_pos_gap1 = 0
                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1)
                        if pos == len(seq1)-1:
                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2)
                pos += 1
if __name__ == "__main__":
    main()