# HG changeset patch # User xuebing # Date 1333216714 14400 # Node ID d445743c65bf359bac09a22b12299ca46d935ba8 Uploaded diff -r 000000000000 -r d445743c65bf bed_shuffle.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bed_shuffle.py Sat Mar 31 13:58:34 2012 -0400 @@ -0,0 +1,107 @@ +''' +simulate a random interval set that mimics the size and strand of a reference set +''' + +def inferSizeFromRefBed(filename,header): + ''' + read reference interval set, get chrom size information + ''' + chrSize = {} + f = open(filename) + if header: + header = f.readline() + for line in f: + flds = line.strip().split('\t') + if not chrSize.has_key(flds[0]): + chrSize[flds[0]] = int(flds[2]) + elif chrSize[flds[0]] < int(flds[2]): + chrSize[flds[0]] = int(flds[2]) + f.close() + return chrSize + +def getChrSize(filename): + chrSize = {} + f = open(filename) + for line in f: + flds = line.strip().split('\t') + if len(flds) >1: + chrSize[flds[0]] = int(flds[1]) + f.close() + return chrSize + +def makeWeightedChrom(chrSize): + ''' + make a list of chr_id, the freq is proportional to its length + ''' + + genome_len = 0 + + for chrom in chrSize: + chrom_len = chrSize[chrom] + genome_len += chrom_len + + weighted_chrom = [] + for chrom in chrSize: + weight = int(round(1000*float(chrSize[chrom])/genome_len)) + weighted_chrom += [chrom]*weight + + return weighted_chrom + +def randomIntervalWithinChrom(infile,outfile,chrSize,header): + ''' + ''' + fin = open(infile) + if header: + header = fin.readline() + fout = open(outfile,'w') + n = 0 + for line in fin: + n = n + 1 + flds = line.strip().split('\t') + interval_size = int(flds[2]) - int(flds[1]) + rstart = random.randint(0,chrSize[flds[0]]-interval_size) + fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') + fin.close() + fout.close() + +def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header): + ''' + ''' + fin = open(infile) + if header: + header = fin.readline() + fout = open(outfile,'w') + n = 0 + for line in fin: + n = n + 1 + flds = line.strip().split('\t') + interval_size = int(flds[2]) - int(flds[1]) + # find a random chrom + flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)] + # random start in the chrom + rstart = random.randint(0,chrSize[flds[0]]-interval_size) + fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n') + fin.close() + fout.close() + +import sys,random +def main(): + # python random_interval.py test100.bed testout.bed across header human.hg18.genome + + reference_interval_file = sys.argv[1] + output_file = sys.argv[2] + across_or_within_chrom = sys.argv[3] # within or across + if sys.argv[4] == 'header': + header = True + else: + header = False + if len(sys.argv) == 6: + chrom_size_file = sys.argv[5] + chrSize = getChrSize(chrom_size_file) + else: + chrSize = inferSizeFromRefBed(reference_interval_file,header) + if across_or_within_chrom == 'within': + randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header) + else: + randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header) +main()