20
|
1 import sys
|
|
2
|
|
def readChrSize(filename):
    """Load chromosome sizes from a whitespace-delimited two-column file.

    Each non-blank line must hold a chromosome name followed by its
    integer length. Blank lines are ignored.

    Args:
        filename: Path of the chromosome-size file.

    Returns:
        dict mapping each chromosome name to its size as an int.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields
            or its size field is not an integer.
    """
    chrSize = {}
    # `with` closes the file even when a malformed line raises.
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing newline) carries no data.
                continue
            chrom, size = fields
            chrSize[chrom] = int(size)
    return chrSize
|
|
11
|
|
def cleanFile(filename, chrSize, outfile):
    """Copy the valid records of a tab-separated BED file to ``outfile``.

    A record is dropped, with a message printed to stdout, when it has
    fewer than three fields, names a chromosome missing from ``chrSize``,
    has non-integer or negative coordinates, or ends beyond the
    chromosome size. Records whose start is greater than their end have
    the two coordinates swapped, and a ``*`` in the sixth (strand) column
    is rewritten to ``+`` before the record is written.

    Args:
        filename: Path of the input BED file.
        chrSize: dict mapping chromosome name to its size (int).
        outfile: Path of the cleaned BED file to create or overwrite.
    """
    # `with` guarantees both handles are closed even if a line raises.
    with open(filename) as f, open(outfile, 'w') as out:
        for i, line in enumerate(f, 1):
            flds = line.strip().split('\t')
            if len(flds) < 3:
                print('line %d incomplete line:\n%s' % (i, line))
                continue

            chrom = flds[0]
            if chrom not in chrSize:
                print('line %d chromosome %s not found!\n%s'
                      % (i, chrom, line))
                continue

            try:
                start, end = int(flds[1]), int(flds[2])
            except ValueError:
                # Report unparsable coordinates like any other bad record
                # instead of aborting the whole run.
                print('line %d non-integer coordinates:\n%s' % (i, line))
                continue

            if start > end:
                flds[1], flds[2] = flds[2], flds[1]
                start, end = end, start

            # After the swap start <= end, so testing start covers both.
            if start < 0:
                print('line %d negative coordinates:\n%s' % (i, line))
                continue
            if end > chrSize[chrom]:
                print('line %d end larger than chr size:\n%s' % (i, line))
                continue

            # The strand column is optional; only touch it when present.
            if len(flds) > 5 and flds[5] == '*':
                flds[5] = '+'
                print('line %d  strand * changed to +\n%s' % (i, line))
            out.write('\t'.join(flds) + '\n')
|
|
39
|
|
# Command-line entry point: bedClean.py in.bed chrsizefile out.bed
# Guarded so that importing this module does not run (or exit) the script.
if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("python bedClean.py in.bed chrsizefile out.bed")
        # Non-zero status tells calling scripts the arguments were wrong.
        sys.exit(1)
    cleanFile(sys.argv[1], readChrSize(sys.argv[2]), sys.argv[3])
|