Mercurial > repos > xuebing > bed_shuffle_chrom
comparison bed_shuffle_chrom.py @ 2:0a4959804274 default tip
Uploaded
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 14:00:56 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:11e3609fa73c | 2:0a4959804274 |
---|---|
1 ''' | |
2 simulate a random interval set that mimics the size and strand of a reference set | |
3 ''' | |
4 | |
5 def inferSizeFromRefBed(filename): | |
6 ''' | |
7 read reference interval set, get chrom size information | |
8 ''' | |
9 chrSize = {} | |
10 f = open(filename) | |
11 for line in f: | |
12 flds = line.strip().split('\t') | |
13 if not chrSize.has_key(flds[0]): | |
14 chrSize[flds[0]] = int(flds[2]) | |
15 elif chrSize[flds[0]] < int(flds[2]): | |
16 chrSize[flds[0]] = int(flds[2]) | |
17 f.close() | |
18 return chrSize | |
19 | |
20 def getChrSize(filename): | |
21 chrSize = {} | |
22 f = open(filename) | |
23 for line in f: | |
24 flds = line.strip().split() | |
25 if len(flds) >1: | |
26 chrSize[flds[0]] = int(flds[1]) | |
27 f.close() | |
28 return chrSize | |
29 | |
30 def makeWeightedChrom(chrSize): | |
31 ''' | |
32 make a list of chr_id, the freq is proportional to its length | |
33 ''' | |
34 | |
35 genome_len = 0 | |
36 | |
37 for chrom in chrSize: | |
38 chrom_len = chrSize[chrom] | |
39 genome_len += chrom_len | |
40 | |
41 weighted_chrom = [] | |
42 for chrom in chrSize: | |
43 weight = int(round(1000*float(chrSize[chrom])/genome_len)) | |
44 weighted_chrom += [chrom]*weight | |
45 | |
46 return weighted_chrom | |
47 | |
48 def randomIntervalWithinChrom(infile,outfile,chrSize): | |
49 ''' | |
50 ''' | |
51 fin = open(infile) | |
52 fout = open(outfile,'w') | |
53 n = 0 | |
54 for line in fin: | |
55 n = n + 1 | |
56 flds = line.strip().split('\t') | |
57 interval_size = int(flds[2]) - int(flds[1]) | |
58 flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) | |
59 flds[2] = str(int(flds[1])+interval_size) | |
60 fout.write('\t'.join(flds)+'\n') | |
61 fin.close() | |
62 fout.close() | |
63 | |
64 def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom): | |
65 ''' | |
66 ''' | |
67 fin = open(infile) | |
68 fout = open(outfile,'w') | |
69 n = 0 | |
70 for line in fin: | |
71 n = n + 1 | |
72 flds = line.strip().split('\t') | |
73 interval_size = int(flds[2]) - int(flds[1]) | |
74 # find a random chrom | |
75 flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] | |
76 flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) | |
77 flds[2] = str(int(flds[1])+interval_size) | |
78 fout.write('\t'.join(flds)+'\n') | |
79 fin.close() | |
80 fout.close() | |
81 | |
82 import sys,random | |
83 def main(): | |
84 # python random_interval.py test100.bed testout.bed across human.hg18.genome | |
85 | |
86 reference_interval_file = sys.argv[1] | |
87 output_file = sys.argv[2] | |
88 across_or_within_chrom = sys.argv[3] # within or across | |
89 chrom_size_file = sys.argv[4] | |
90 chrSize = getChrSize(chrom_size_file) | |
91 print chrSize.keys() | |
92 if across_or_within_chrom == 'within': | |
93 randomIntervalWithinChrom(reference_interval_file,output_file,chrSize) | |
94 else: | |
95 randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize)) | |
96 main() |