diff tools/regVariation/substitutions.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/substitutions.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+#Guruprasad ANanda
+"""
+Fetches substitutions from pairwise alignments.
+"""
+
+from galaxy import eggs
+
+from galaxy.tools.util import maf_utilities
+
+import bx.align.maf
+import sys
+import os, fileinput
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+if len(sys.argv) < 3:
+        stop_err("Incorrect number of arguments.")    
+    
+inp_file = sys.argv[1]
+out_file = sys.argv[2]
+fout = open(out_file, 'w')
+
+def fetchSubs(block):
+    
+    src1 = block.components[0].src
+    sequence1 = block.components[0].text
+    start1 = block.components[0].start
+    end1 = block.components[0].end
+    len1 = int(end1)-int(start1)
+    len1_withgap = len(sequence1)
+    
+    for seq in range (1,len(block.components)):
+        src2 = block.components[seq].src
+        sequence2 = block.components[seq].text
+        start2 = block.components[seq].start
+        end2 = block.components[seq].end
+        len2 = int(end2)-int(start2)
+        sub_begin = None
+        sub_end = None
+        begin = False
+        
+        for nt in range(len1_withgap):
+            if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
+                if sequence1[nt].upper() != sequence2[nt].upper():
+                    if not(begin):
+                        sub_begin = nt
+                        begin = True
+                    sub_end = nt
+                else:
+                    if begin:
+                        print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),start1+sub_end-sequence1[0:sub_end].count('-'))
+                        print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),start2+sub_end-sequence2[0:sub_end].count('-'))    
+                        begin = False
+
+            else:
+                if begin:
+                    print >>fout, "%s\t%s\t%s" %(src1,start1+sub_begin-sequence1[0:sub_begin].count('-'),end1+sub_end-sequence1[0:sub_end].count('-'))
+                    print >>fout, "%s\t%s\t%s" %(src2,start2+sub_begin-sequence2[0:sub_begin].count('-'),end2+sub_end-sequence2[0:sub_end].count('-'))    
+                    begin = False
+                    ended = False
+    
+              
+def main():
+    skipped = 0
+    not_pairwise = 0
+    try:
+        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
+    except:
+        stop_err("Your MAF file appears to be malformed.")
+    print >>fout, "#Chr\tStart\tEnd"
+    for block in maf_reader:
+        if len(block.components) != 2:
+            not_pairwise += 1
+            continue
+        try:
+            fetchSubs(block)
+        except:
+            skipped += 1
+    
+    if not_pairwise:
+        print "Skipped %d non-pairwise blocks" %(not_pairwise)
+    if skipped:
+        print "Skipped %d blocks" %(skipped)
+if __name__ == "__main__":
+    main()