annotate bed_shuffle.py @ 0:d445743c65bf

Uploaded
author xuebing
date Sat, 31 Mar 2012 13:58:34 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d445743c65bf Uploaded
xuebing
parents:
diff changeset
1 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
2 simulate a random interval set that mimics the size and strand of a reference set
d445743c65bf Uploaded
xuebing
parents:
diff changeset
3 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
4
d445743c65bf Uploaded
xuebing
parents:
diff changeset
5 def inferSizeFromRefBed(filename,header):
d445743c65bf Uploaded
xuebing
parents:
diff changeset
6 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
7 read reference interval set, get chrom size information
d445743c65bf Uploaded
xuebing
parents:
diff changeset
8 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
9 chrSize = {}
d445743c65bf Uploaded
xuebing
parents:
diff changeset
10 f = open(filename)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
11 if header:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
12 header = f.readline()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
13 for line in f:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
14 flds = line.strip().split('\t')
d445743c65bf Uploaded
xuebing
parents:
diff changeset
15 if not chrSize.has_key(flds[0]):
d445743c65bf Uploaded
xuebing
parents:
diff changeset
16 chrSize[flds[0]] = int(flds[2])
d445743c65bf Uploaded
xuebing
parents:
diff changeset
17 elif chrSize[flds[0]] < int(flds[2]):
d445743c65bf Uploaded
xuebing
parents:
diff changeset
18 chrSize[flds[0]] = int(flds[2])
d445743c65bf Uploaded
xuebing
parents:
diff changeset
19 f.close()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
20 return chrSize
d445743c65bf Uploaded
xuebing
parents:
diff changeset
21
d445743c65bf Uploaded
xuebing
parents:
diff changeset
22 def getChrSize(filename):
d445743c65bf Uploaded
xuebing
parents:
diff changeset
23 chrSize = {}
d445743c65bf Uploaded
xuebing
parents:
diff changeset
24 f = open(filename)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
25 for line in f:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
26 flds = line.strip().split('\t')
d445743c65bf Uploaded
xuebing
parents:
diff changeset
27 if len(flds) >1:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
28 chrSize[flds[0]] = int(flds[1])
d445743c65bf Uploaded
xuebing
parents:
diff changeset
29 f.close()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
30 return chrSize
d445743c65bf Uploaded
xuebing
parents:
diff changeset
31
d445743c65bf Uploaded
xuebing
parents:
diff changeset
32 def makeWeightedChrom(chrSize):
d445743c65bf Uploaded
xuebing
parents:
diff changeset
33 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
34 make a list of chr_id, the freq is proportional to its length
d445743c65bf Uploaded
xuebing
parents:
diff changeset
35 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
36
d445743c65bf Uploaded
xuebing
parents:
diff changeset
37 genome_len = 0
d445743c65bf Uploaded
xuebing
parents:
diff changeset
38
d445743c65bf Uploaded
xuebing
parents:
diff changeset
39 for chrom in chrSize:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
40 chrom_len = chrSize[chrom]
d445743c65bf Uploaded
xuebing
parents:
diff changeset
41 genome_len += chrom_len
d445743c65bf Uploaded
xuebing
parents:
diff changeset
42
d445743c65bf Uploaded
xuebing
parents:
diff changeset
43 weighted_chrom = []
d445743c65bf Uploaded
xuebing
parents:
diff changeset
44 for chrom in chrSize:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
45 weight = int(round(1000*float(chrSize[chrom])/genome_len))
d445743c65bf Uploaded
xuebing
parents:
diff changeset
46 weighted_chrom += [chrom]*weight
d445743c65bf Uploaded
xuebing
parents:
diff changeset
47
d445743c65bf Uploaded
xuebing
parents:
diff changeset
48 return weighted_chrom
d445743c65bf Uploaded
xuebing
parents:
diff changeset
49
d445743c65bf Uploaded
xuebing
parents:
diff changeset
50 def randomIntervalWithinChrom(infile,outfile,chrSize,header):
d445743c65bf Uploaded
xuebing
parents:
diff changeset
51 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
52 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
53 fin = open(infile)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
54 if header:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
55 header = fin.readline()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
56 fout = open(outfile,'w')
d445743c65bf Uploaded
xuebing
parents:
diff changeset
57 n = 0
d445743c65bf Uploaded
xuebing
parents:
diff changeset
58 for line in fin:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
59 n = n + 1
d445743c65bf Uploaded
xuebing
parents:
diff changeset
60 flds = line.strip().split('\t')
d445743c65bf Uploaded
xuebing
parents:
diff changeset
61 interval_size = int(flds[2]) - int(flds[1])
d445743c65bf Uploaded
xuebing
parents:
diff changeset
62 rstart = random.randint(0,chrSize[flds[0]]-interval_size)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
63 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
d445743c65bf Uploaded
xuebing
parents:
diff changeset
64 fin.close()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
65 fout.close()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
66
d445743c65bf Uploaded
xuebing
parents:
diff changeset
67 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header):
d445743c65bf Uploaded
xuebing
parents:
diff changeset
68 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
69 '''
d445743c65bf Uploaded
xuebing
parents:
diff changeset
70 fin = open(infile)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
71 if header:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
72 header = fin.readline()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
73 fout = open(outfile,'w')
d445743c65bf Uploaded
xuebing
parents:
diff changeset
74 n = 0
d445743c65bf Uploaded
xuebing
parents:
diff changeset
75 for line in fin:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
76 n = n + 1
d445743c65bf Uploaded
xuebing
parents:
diff changeset
77 flds = line.strip().split('\t')
d445743c65bf Uploaded
xuebing
parents:
diff changeset
78 interval_size = int(flds[2]) - int(flds[1])
d445743c65bf Uploaded
xuebing
parents:
diff changeset
79 # find a random chrom
d445743c65bf Uploaded
xuebing
parents:
diff changeset
80 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
d445743c65bf Uploaded
xuebing
parents:
diff changeset
81 # random start in the chrom
d445743c65bf Uploaded
xuebing
parents:
diff changeset
82 rstart = random.randint(0,chrSize[flds[0]]-interval_size)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
83 fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
d445743c65bf Uploaded
xuebing
parents:
diff changeset
84 fin.close()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
85 fout.close()
d445743c65bf Uploaded
xuebing
parents:
diff changeset
86
d445743c65bf Uploaded
xuebing
parents:
diff changeset
87 import sys,random
d445743c65bf Uploaded
xuebing
parents:
diff changeset
88 def main():
d445743c65bf Uploaded
xuebing
parents:
diff changeset
89 # python random_interval.py test100.bed testout.bed across header human.hg18.genome
d445743c65bf Uploaded
xuebing
parents:
diff changeset
90
d445743c65bf Uploaded
xuebing
parents:
diff changeset
91 reference_interval_file = sys.argv[1]
d445743c65bf Uploaded
xuebing
parents:
diff changeset
92 output_file = sys.argv[2]
d445743c65bf Uploaded
xuebing
parents:
diff changeset
93 across_or_within_chrom = sys.argv[3] # within or across
d445743c65bf Uploaded
xuebing
parents:
diff changeset
94 if sys.argv[4] == 'header':
d445743c65bf Uploaded
xuebing
parents:
diff changeset
95 header = True
d445743c65bf Uploaded
xuebing
parents:
diff changeset
96 else:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
97 header = False
d445743c65bf Uploaded
xuebing
parents:
diff changeset
98 if len(sys.argv) == 6:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
99 chrom_size_file = sys.argv[5]
d445743c65bf Uploaded
xuebing
parents:
diff changeset
100 chrSize = getChrSize(chrom_size_file)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
101 else:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
102 chrSize = inferSizeFromRefBed(reference_interval_file,header)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
103 if across_or_within_chrom == 'within':
d445743c65bf Uploaded
xuebing
parents:
diff changeset
104 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
105 else:
d445743c65bf Uploaded
xuebing
parents:
diff changeset
106 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header)
d445743c65bf Uploaded
xuebing
parents:
diff changeset
107 main()