view shuffleBed.py @ 11:b7f1d9f8f3bc

Uploaded
author xuebing
date Sat, 10 Mar 2012 07:59:27 -0500
parents
children
line wrap: on
line source

'''
simulate a random interval set that mimics the size and strand of a reference set 
'''

def inferSizeFromRefBed(filename,header):
    '''
    read reference interval set, get chrom size information
    '''
    chrSize = {}
    f = open(filename)
    if header:
        header = f.readline()
    for line in f:
        flds = line.strip().split('\t')
        if not chrSize.has_key(flds[0]):
            chrSize[flds[0]] = int(flds[2])
        elif chrSize[flds[0]] < int(flds[2]):
            chrSize[flds[0]] = int(flds[2])
    f.close()
    return chrSize 

def getChrSize(filename):
    chrSize = {}
    f = open(filename)
    for line in f:
        flds = line.strip().split('\t')
        if len(flds) >1:
            chrSize[flds[0]] = int(flds[1])
    f.close()
    return chrSize
    
def makeWeightedChrom(chrSize):
    '''
    make a list of chr_id, the freq is proportional to its length
    '''
     
    genome_len = 0
    
    for chrom in chrSize:
        chrom_len = chrSize[chrom]
        genome_len += chrom_len

    weighted_chrom = []
    for chrom in chrSize:
        weight = int(round(1000*float(chrSize[chrom])/genome_len))
        weighted_chrom += [chrom]*weight

    return weighted_chrom            

def randomIntervalWithinChrom(infile,outfile,chrSize,header):
    '''
    '''
    fin = open(infile)
    if header:
        header = fin.readline()
    fout = open(outfile,'w')
    n = 0
    for line in fin:
        n = n + 1
        flds = line.strip().split('\t')
        interval_size = int(flds[2]) - int(flds[1])
        rstart = random.randint(0,chrSize[flds[0]]-interval_size)
        fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
    fin.close()
    fout.close()   

def randomIntervalAcrossChrom(infile,outfile,chrSize,weighted_chrom,header):
    '''
    '''
    fin = open(infile)
    if header:
        header = fin.readline()
    fout = open(outfile,'w')
    n = 0
    for line in fin:
        n = n + 1
        flds = line.strip().split('\t')
        interval_size = int(flds[2]) - int(flds[1])
        # find a random chrom
        flds[0] = weighted_chrom[random.randint(0, len(weighted_chrom) - 1)]
        # random start in the chrom
        rstart = random.randint(0,chrSize[flds[0]]-interval_size)
        fout.write(flds[0]+'\t'+str(rstart)+'\t'+str(rstart+interval_size)+'\t'+str(n)+'\t0\t+\n')
    fin.close()
    fout.close()            

import sys,random
def main():
    # python random_interval.py test100.bed testout.bed across header human.hg18.genome 

    reference_interval_file = sys.argv[1]
    output_file = sys.argv[2]
    across_or_within_chrom = sys.argv[3] # within or across 
    if sys.argv[4] == 'header':
        header = True 
    else:
        header = False
    if len(sys.argv) == 6:
        chrom_size_file = sys.argv[5]
        chrSize = getChrSize(chrom_size_file)
    else:
        chrSize = inferSizeFromRefBed(reference_interval_file,header) 
    if across_or_within_chrom == 'within':            
        randomIntervalWithinChrom(reference_interval_file,output_file,chrSize,header)
    else:
        randomIntervalAcrossChrom(reference_interval_file,output_file,chrSize,makeWeightedChrom(chrSize),header)   
main()