# HG changeset patch
# User xuebing
# Date 1333213434 14400
# Node ID 8fcf33cf58be55ea87270050a4fd34a3294dce25
# Parent 0adaf63c58d5362754a1f7ff1bd3d7e67f308089
# Uploaded
#
# diff -r 0adaf63c58d5 -r 8fcf33cf58be bedClean.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/bedClean.py Sat Mar 31 13:03:54 2012 -0400
"""Clean a tab-separated interval (BED-style) file against chromosome sizes.

Usage: python bedClean.py in.bed chrsizefile out.bed

Records that pass every check are written to the output file; every
rejected or modified record is reported on standard output.
"""
import sys


def readChrSize(filename):
    """Read a chromosome-size table into a dict.

    Each non-blank line must hold a chromosome name followed by its
    integer size, separated by whitespace.  Any further columns on the
    line are ignored.

    Args:
        filename: path of the chromosome-size file.

    Returns:
        dict mapping chromosome name (str) to size (int).

    Raises:
        ValueError: if a non-blank line has fewer than two columns or
            its second column is not an integer.
    """
    chrSize = {}
    with open(filename) as f:
        for line in f:
            flds = line.split()
            if not flds:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            if len(flds) < 2:
                raise ValueError(
                    'chromosome size line needs a name and a size: %r' % line)
            chrSize[flds[0]] = int(flds[1])
    return chrSize


def _report(line_no, message, line):
    """Print one diagnostic followed by the offending input line."""
    # A single pre-formatted argument prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print('line %d %s\n%s' % (line_no, message, line))


def cleanFile(filename, chrSize, outfile):
    """Copy the valid records of ``filename`` to ``outfile``.

    For every input line (1-based numbering in the diagnostics):

    * fewer than three tab-separated fields -> reported, dropped;
    * chromosome (field 1) not in ``chrSize`` -> reported, dropped;
    * start/end (fields 2 and 3) not integers -> reported, dropped;
    * start greater than end -> the two fields are swapped;
    * a negative coordinate -> reported, dropped;
    * end larger than the chromosome size -> reported, dropped;
    * a sixth field equal to '*' -> replaced by '+' and reported.

    Surviving records are written tab-joined, one per line.

    Args:
        filename: path of the input file.
        chrSize: mapping of chromosome name to size, as returned by
            readChrSize.
        outfile: path of the output file (overwritten).
    """
    with open(filename) as f, open(outfile, 'w') as out:
        for i, line in enumerate(f, 1):
            flds = line.strip().split('\t')
            if len(flds) < 3:
                _report(i, 'incomplete line:', line)
                continue
            if flds[0] not in chrSize:
                _report(i, 'chromosome %s not found!' % flds[0], line)
                continue
            try:
                start, end = int(flds[1]), int(flds[2])
            except ValueError:
                # Skip the record instead of aborting the whole run and
                # leaving a truncated output file behind.
                _report(i, 'non-integer coordinates:', line)
                continue
            if start > end:
                start, end = end, start
                flds[1], flds[2] = flds[2], flds[1]
            # start <= end holds here, so checking start covers both.
            if start < 0:
                _report(i, 'negative coordinates:', line)
                continue
            if end > chrSize[flds[0]]:
                _report(i, 'end larger than chr size:', line)
                continue
            # The strand column is optional; only touch it when present.
            if len(flds) > 5 and flds[5] == '*':
                flds[5] = '+'
                _report(i, ' strand * changed to +', line)
            out.write('\t'.join(flds) + '\n')


def main(argv):
    """Command-line entry point; ``argv`` is the full ``sys.argv`` list."""
    if len(argv) < 4:
        print("python bedClean.py in.bed chrsizefile out.bed")
        sys.exit()
    cleanFile(argv[1], readChrSize(argv[2]), argv[3])


if __name__ == '__main__':
    main(sys.argv)