comparison random_interval.py @ 20:16ba480adf96

Uploaded
author xuebing
date Sat, 31 Mar 2012 08:31:22 -0400
parents
children
comparison
equal deleted inserted replaced
19:d325683ec368 20:16ba480adf96
1 '''
2 simulate a random interval set that mimics the size and strand of a reference set
3 '''
4
5 def inferSizeFromRefBed(filename):
6 '''
7 read reference interval set, get chrom size information
8 '''
9 chrSize = {}
10 f = open(filename)
11 for line in f:
12 flds = line.strip().split('\t')
13 if not chrSize.has_key(flds[0]):
14 chrSize[flds[0]] = int(flds[2])
15 elif chrSize[flds[0]] < int(flds[2]):
16 chrSize[flds[0]] = int(flds[2])
17 f.close()
18 return chrSize
19
20 def getChrSize(filename):
21 chrSize = {}
22 f = open(filename)
23 for line in f:
24 flds = line.strip().split()
25 if len(flds) >1:
26 chrSize[flds[0]] = int(flds[1])
27 f.close()
28 return chrSize
29
30 def makeWeightedChrom(chrSize):
31 '''
32 make a list of chr_id, the freq is proportional to its length
33 '''
34
35 genome_len = 0
36
37 for chrom in chrSize:
38 chrom_len = chrSize[chrom]
39 genome_len += chrom_len
40
41 weighted_chrom = []
42 for chrom in chrSize:
43 weight = int(round(1000*float(chrSize[chrom])/genome_len))
44 weighted_chrom += [chrom]*weight
45
46 return weighted_chrom
47
48 def randomIntervalWithinChrom(infile,outfile,chrSize):
49 '''
50 '''
51 fin = open(infile)
52 fout = open(outfile,'w')
53 n = 0
54 for line in fin:
55 n = n + 1
56 flds = line.strip().split('\t')
57 interval_size = int(flds[2]) - int(flds[1])
58 flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
59 flds[2] = str(int(flds[1])+interval_size)
60 fout.write('\t'.join(flds)+'\n')
61 fin.close()
62 fout.close()
63
64 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom):
65 '''
66 '''
67 fin = open(infile)
68 fout = open(outfile,'w')
69 n = 0
70 for line in fin:
71 n = n + 1
72 flds = line.strip().split('\t')
73 interval_size = int(flds[2]) - int(flds[1])
74 # find a random chrom
75 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
76 flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size))
77 flds[2] = str(int(flds[1])+interval_size)
78 fout.write('\t'.join(flds)+'\n')
79 fin.close()
80 fout.close()
81
82 import sys,random
83 def main():
84 # python random_interval.py test100.bed testout.bed across human.hg18.genome
85
86 reference_interval_file = sys.argv[1]
87 output_file = sys.argv[2]
88 across_or_within_chrom = sys.argv[3] # within or across
89 chrom_size_file = sys.argv[4]
90 chrSize = getChrSize(chrom_size_file)
91 print chrSize.keys()
92 if across_or_within_chrom == 'within':
93 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize)
94 else:
95 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize))
96 main()