Mercurial > repos > xuebing > bed_shuffle_chrom
view bed_shuffle_chrom.py @ 2:0a4959804274 default tip
Uploaded
author | xuebing |
---|---|
date | Sat, 31 Mar 2012 14:00:56 -0400 |
parents | |
children |
line wrap: on
line source
''' simulate a random interval set that mimics the size and strand of a reference set ''' def inferSizeFromRefBed(filename): ''' read reference interval set, get chrom size information ''' chrSize = {} f = open(filename) for line in f: flds = line.strip().split('\t') if not chrSize.has_key(flds[0]): chrSize[flds[0]] = int(flds[2]) elif chrSize[flds[0]] < int(flds[2]): chrSize[flds[0]] = int(flds[2]) f.close() return chrSize def getChrSize(filename): chrSize = {} f = open(filename) for line in f: flds = line.strip().split() if len(flds) >1: chrSize[flds[0]] = int(flds[1]) f.close() return chrSize def makeWeightedChrom(chrSize): ''' make a list of chr_id, the freq is proportional to its length ''' genome_len = 0 for chrom in chrSize: chrom_len = chrSize[chrom] genome_len += chrom_len weighted_chrom = [] for chrom in chrSize: weight = int(round(1000*float(chrSize[chrom])/genome_len)) weighted_chrom += [chrom]*weight return weighted_chrom def randomIntervalWithinChrom(infile,outfile,chrSize): ''' ''' fin = open(infile) fout = open(outfile,'w') n = 0 for line in fin: n = n + 1 flds = line.strip().split('\t') interval_size = int(flds[2]) - int(flds[1]) flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) flds[2] = str(int(flds[1])+interval_size) fout.write('\t'.join(flds)+'\n') fin.close() fout.close() def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom): ''' ''' fin = open(infile) fout = open(outfile,'w') n = 0 for line in fin: n = n + 1 flds = line.strip().split('\t') interval_size = int(flds[2]) - int(flds[1]) # find a random chrom flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] flds[1] = str(random.randint(0,chrSize[flds[0]]-interval_size)) flds[2] = str(int(flds[1])+interval_size) fout.write('\t'.join(flds)+'\n') fin.close() fout.close() import sys,random def main(): # python random_interval.py test100.bed testout.bed across human.hg18.genome reference_interval_file = sys.argv[1] output_file = sys.argv[2] across_or_within_chrom = sys.argv[3] # within or across chrom_size_file = sys.argv[4] chrSize = getChrSize(chrom_size_file) print chrSize.keys() if across_or_within_chrom == 'within': randomIntervalWithinChrom(reference_interval_file,output_file,chrSize) else: randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize)) main()