diff tools/regVariation/getIndels.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/getIndels.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+"""
+Estimate INDELs for pair-wise alignments.
+
+usage: %prog maf_input out_file1 out_file2
+"""
+
+from __future__ import division
+from galaxy import eggs
+import pkg_resources 
+pkg_resources.require( "bx-python" )
+try:
+    pkg_resources.require("numpy")
+except:
+    pass
+import psyco_full
+import sys
+from bx.cookbook import doc_optparse
+from galaxy.tools.exception_handling import *
+import bx.align.maf
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def main():   
+    # Parsing Command Line here
+    options, args = doc_optparse.parse( __doc__ )
+    
+    try:
+        inp_file, out_file1 = args    
+    except:
+        print >> sys.stderr, "Tool initialization error."
+        sys.exit()
+    
+    try:
+        fin = open(inp_file,'r')
+    except:
+        print >> sys.stderr, "Unable to open input file"
+        sys.exit()
+    try:
+        fout1 = open(out_file1,'w')
+        #fout2 = open(out_file2,'w')
+    except:
+        print >> sys.stderr, "Unable to open output file"
+        sys.exit()
+
+    try:
+        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
+    except:
+        print >>sys.stderr, "Your MAF file appears to be malformed."
+        sys.exit()
+    maf_count = 0
+    
+    print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length"
+    for block_ind, block in enumerate(maf_reader):
+        if len(block.components) < 2:
+            continue
+        seq1 = block.components[0].text
+        src1 = block.components[0].src
+        start1 = block.components[0].start
+        if len(block.components) == 2:
+            seq2 = block.components[1].text
+            src2 = block.components[1].src
+            start2 = block.components[1].start
+            #for pos in range(len(seq1)):
+            nt_pos1 = start1-1    #position of the nucleotide (without counting gaps)
+            nt_pos2 = start2-1
+            pos = 0        #character column position
+            gaplen1 = 0
+            gaplen2 = 0
+            prev_pos_gap1 = 0
+            prev_pos_gap2 = 0
+            while pos < len(seq1):
+                if prev_pos_gap1 == 0:
+                    gaplen1 = 0
+                if prev_pos_gap2 == 0:
+                    gaplen2 = 0
+                    
+                if seq1[pos] == '-':
+                    if seq2[pos] != '-':
+                        nt_pos2 += 1
+                        gaplen1 += 1
+                        prev_pos_gap1 = 1
+                        #write 2
+                        if prev_pos_gap2 == 1:
+                            prev_pos_gap2 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2)
+                        if pos == len(seq1)-1:
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1)
+                    else:
+                        prev_pos_gap1 = 0
+                        prev_pos_gap2 = 0
+                        """
+                        if prev_pos_gap1 == 1:
+                            prev_pos_gap1 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1)
+                        elif prev_pos_gap2 == 1:
+                            prev_pos_gap2 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2)
+                        """
+                else:
+                    nt_pos1 += 1
+                    if seq2[pos] != '-':
+                        nt_pos2 += 1
+                        #write both
+                        if prev_pos_gap1 == 1:
+                            prev_pos_gap1 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1)
+                        elif prev_pos_gap2 == 1:
+                            prev_pos_gap2 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2)
+                    else:
+                        gaplen2 += 1
+                        prev_pos_gap2 = 1
+                        #write 1
+                        if prev_pos_gap1 == 1:
+                            prev_pos_gap1 = 0
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1)
+                        if pos == len(seq1)-1:
+                            print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2)
+                pos += 1
+if __name__ == "__main__":
+    main()