diff bedClean.py @ 11:b7f1d9f8f3bc

Uploaded
author xuebing
date Sat, 10 Mar 2012 07:59:27 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bedClean.py	Sat Mar 10 07:59:27 2012 -0500
@@ -0,0 +1,37 @@
+import sys
+
+def readChrSize(filename):
+    f = open(filename)
+    chrSize = {}
+    for line in f:
+        chrom,size = line.strip().split()
+        chrSize[chrom]=int(size)
+    f.close()
+    return chrSize
+
+def cleanFile(filename,chrSize,outfile):
+    f = open(filename)
+    out = open(outfile,'w')
+    i = 0
+    for line in f:
+        i = i + 1
+        flds = line.strip().split('\t')
+        if len(flds) < 3:
+            print 'line',i,'incomplete line:\n',line
+        elif chrSize.has_key(flds[0]):
+            if int(flds[1]) > int(flds[2]):
+                tmp = flds[1]
+                flds[1] = flds[2]
+                flds[2] = tmp
+            if int( flds[1]) < 0 or int(flds[2]) <0:
+                print 'line',i,'negative coordinates:\n',line
+            elif int(flds[2]) > chrSize[flds[0]]:
+                print 'line',i,'end larger than chr size:\n',line
+            else:
+                out.write('\t'.join(flds)+'\n')
+        else:
+            print 'line',i,'chromosome',flds[0],'not found!\n',line
+    f.close()
+    out.close()
+
+cleanFile(sys.argv[1],readChrSize(sys.argv[2]),sys.argv[3])