annotate bed_shuffle_chrom.py @ 2:0a4959804274 default tip

Uploaded
author xuebing
date Sat, 31 Mar 2012 14:00:56 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
0a4959804274 Uploaded
xuebing
parents:
diff changeset
1 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
2 simulate a random interval set that mimics the size and strand of a reference set
0a4959804274 Uploaded
xuebing
parents:
diff changeset
3 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
4
0a4959804274 Uploaded
xuebing
parents:
diff changeset
5 def inferSizeFromRefBed(filename):
0a4959804274 Uploaded
xuebing
parents:
diff changeset
6 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
7 read reference interval set, get chrom size information
0a4959804274 Uploaded
xuebing
parents:
diff changeset
8 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
9 chrSize = {}
0a4959804274 Uploaded
xuebing
parents:
diff changeset
10 f = open(filename)
0a4959804274 Uploaded
xuebing
parents:
diff changeset
11 for line in f:
0a4959804274 Uploaded
xuebing
parents:
diff changeset
12 flds = line.strip().split('\t')
0a4959804274 Uploaded
xuebing
parents:
diff changeset
13 if not chrSize.has_key(flds[0]):
0a4959804274 Uploaded
xuebing
parents:
diff changeset
14 chrSize[flds[0]] = int(flds[2])
0a4959804274 Uploaded
xuebing
parents:
diff changeset
15 elif chrSize[flds[0]] < int(flds[2]):
0a4959804274 Uploaded
xuebing
parents:
diff changeset
16 chrSize[flds[0]] = int(flds[2])
0a4959804274 Uploaded
xuebing
parents:
diff changeset
17 f.close()
0a4959804274 Uploaded
xuebing
parents:
diff changeset
18 return chrSize
0a4959804274 Uploaded
xuebing
parents:
diff changeset
19
0a4959804274 Uploaded
xuebing
parents:
diff changeset
20 def getChrSize(filename):
0a4959804274 Uploaded
xuebing
parents:
diff changeset
21 chrSize = {}
0a4959804274 Uploaded
xuebing
parents:
diff changeset
22 f = open(filename)
0a4959804274 Uploaded
xuebing
parents:
diff changeset
23 for line in f:
0a4959804274 Uploaded
xuebing
parents:
diff changeset
24 flds = line.strip().split()
0a4959804274 Uploaded
xuebing
parents:
diff changeset
25 if len(flds) >1:
0a4959804274 Uploaded
xuebing
parents:
diff changeset
26 chrSize[flds[0]] = int(flds[1])
0a4959804274 Uploaded
xuebing
parents:
diff changeset
27 f.close()
0a4959804274 Uploaded
xuebing
parents:
diff changeset
28 return chrSize
0a4959804274 Uploaded
xuebing
parents:
diff changeset
29
0a4959804274 Uploaded
xuebing
parents:
diff changeset
30 def makeWeightedChrom(chrSize):
0a4959804274 Uploaded
xuebing
parents:
diff changeset
31 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
32 make a list of chr_id, the freq is proportional to its length
0a4959804274 Uploaded
xuebing
parents:
diff changeset
33 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
34
0a4959804274 Uploaded
xuebing
parents:
diff changeset
35 genome_len = 0
0a4959804274 Uploaded
xuebing
parents:
diff changeset
36
0a4959804274 Uploaded
xuebing
parents:
diff changeset
37 for chrom in chrSize:
0a4959804274 Uploaded
xuebing
parents:
diff changeset
38 chrom_len = chrSize[chrom]
0a4959804274 Uploaded
xuebing
parents:
diff changeset
39 genome_len += chrom_len
0a4959804274 Uploaded
xuebing
parents:
diff changeset
40
0a4959804274 Uploaded
xuebing
parents:
diff changeset
41 weighted_chrom = []
0a4959804274 Uploaded
xuebing
parents:
diff changeset
42 for chrom in chrSize:
0a4959804274 Uploaded
xuebing
parents:
diff changeset
43 weight = int(round(1000*float(chrSize[chrom])/genome_len))
0a4959804274 Uploaded
xuebing
parents:
diff changeset
44 weighted_chrom += [chrom]*weight
0a4959804274 Uploaded
xuebing
parents:
diff changeset
45
0a4959804274 Uploaded
xuebing
parents:
diff changeset
46 return weighted_chrom
0a4959804274 Uploaded
xuebing
parents:
diff changeset
47
0a4959804274 Uploaded
xuebing
parents:
diff changeset
48 def randomIntervalWithinChrom(infile,outfile,chrSize):
0a4959804274 Uploaded
xuebing
parents:
diff changeset
49 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
50 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
51 fin = open(infile)
0a4959804274 Uploaded
xuebing
parents:
diff changeset
52 fout = open(outfile,'w')
0a4959804274 Uploaded
xuebing
parents:
diff changeset
53 n = 0
0a4959804274 Uploaded
xuebing
parents:
diff changeset
54 for line in fin:
0a4959804274 Uploaded
xuebing
parents:
diff changeset
55 n = n + 1
0a4959804274 Uploaded
xuebing
parents:
diff changeset
56 flds = line.strip().split('\t')
0a4959804274 Uploaded
xuebing
parents:
diff changeset
57 interval_size = int(flds[2]) - int(flds[1])
0a4959804274 Uploaded
xuebing
parents:
diff changeset
58 flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
0a4959804274 Uploaded
xuebing
parents:
diff changeset
59 flds[2] = str(int(flds[1])+interval_size)
0a4959804274 Uploaded
xuebing
parents:
diff changeset
60 fout.write('\t'.join(flds)+'\n')
0a4959804274 Uploaded
xuebing
parents:
diff changeset
61 fin.close()
0a4959804274 Uploaded
xuebing
parents:
diff changeset
62 fout.close()
0a4959804274 Uploaded
xuebing
parents:
diff changeset
63
0a4959804274 Uploaded
xuebing
parents:
diff changeset
64 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom):
0a4959804274 Uploaded
xuebing
parents:
diff changeset
65 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
66 '''
0a4959804274 Uploaded
xuebing
parents:
diff changeset
67 fin = open(infile)
0a4959804274 Uploaded
xuebing
parents:
diff changeset
68 fout = open(outfile,'w')
0a4959804274 Uploaded
xuebing
parents:
diff changeset
69 n = 0
0a4959804274 Uploaded
xuebing
parents:
diff changeset
70 for line in fin:
0a4959804274 Uploaded
xuebing
parents:
diff changeset
71 n = n + 1
0a4959804274 Uploaded
xuebing
parents:
diff changeset
72 flds = line.strip().split('\t')
0a4959804274 Uploaded
xuebing
parents:
diff changeset
73 interval_size = int(flds[2]) - int(flds[1])
0a4959804274 Uploaded
xuebing
parents:
diff changeset
74 # find a random chrom
0a4959804274 Uploaded
xuebing
parents:
diff changeset
75 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
0a4959804274 Uploaded
xuebing
parents:
diff changeset
76 flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
0a4959804274 Uploaded
xuebing
parents:
diff changeset
77 flds[2] = str(int(flds[1])+interval_size)
0a4959804274 Uploaded
xuebing
parents:
diff changeset
78 fout.write('\t'.join(flds)+'\n')
0a4959804274 Uploaded
xuebing
parents:
diff changeset
79 fin.close()
0a4959804274 Uploaded
xuebing
parents:
diff changeset
80 fout.close()
0a4959804274 Uploaded
xuebing
parents:
diff changeset
81
0a4959804274 Uploaded
xuebing
parents:
diff changeset
82 import sys,random
0a4959804274 Uploaded
xuebing
parents:
diff changeset
83 def main():
0a4959804274 Uploaded
xuebing
parents:
diff changeset
84 # python random_interval.py test100.bed testout.bed across human.hg18.genome
0a4959804274 Uploaded
xuebing
parents:
diff changeset
85
0a4959804274 Uploaded
xuebing
parents:
diff changeset
86 reference_interval_file = sys.argv[1]
0a4959804274 Uploaded
xuebing
parents:
diff changeset
87 output_file = sys.argv[2]
0a4959804274 Uploaded
xuebing
parents:
diff changeset
88 across_or_within_chrom = sys.argv[3] # within or across
0a4959804274 Uploaded
xuebing
parents:
diff changeset
89 chrom_size_file = sys.argv[4]
0a4959804274 Uploaded
xuebing
parents:
diff changeset
90 chrSize = getChrSize(chrom_size_file)
0a4959804274 Uploaded
xuebing
parents:
diff changeset
91 print chrSize.keys()
0a4959804274 Uploaded
xuebing
parents:
diff changeset
92 if across_or_within_chrom == 'within':
0a4959804274 Uploaded
xuebing
parents:
diff changeset
93 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize)
0a4959804274 Uploaded
xuebing
parents:
diff changeset
94 else:
0a4959804274 Uploaded
xuebing
parents:
diff changeset
95 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize))
0a4959804274 Uploaded
xuebing
parents:
diff changeset
96 main()