view kmersvm/scripts/nullseq_generate.py @ 7:fd740d515502 draft default tip

Uploaded revised kmer-SVM to include modules from kmer-visual.
author cafletezbrant
date Sun, 16 Jun 2013 18:06:14 -0400
parents 7fe1103032f7
children
line wrap: on
line source

import os
import os.path
import sys
import random
import optparse

from bitarray import bitarray

def read_bed_file(filename):
	"""
	"""
	f = open(filename, 'r')
	lines = f.readlines()
	f.close()

	positions = []

	for line in lines:
		if line[0] == '#': 
			continue

		l = line.split()

		positions.append((l[0], int(l[1]), int(l[2])))

	return positions


def bitarray_fromfile(filename):
	"""
	"""
	fh = open(filename, 'rb')
	bits = bitarray()
	bits.fromfile(fh)

	return bits, fh


def make_profile(positions, buildname, basedir):
	"""
	"""
	chrnames = sorted(set(map(lambda p: p[0], positions)))

	profiles = []
	for chrom in chrnames:
		idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
		idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))

		#if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False:
		#	continue
		bits_gc, gcf = bitarray_fromfile(idxf_gc)
		bits_rpt, rptf = bitarray_fromfile(idxf_rpt)

		for pos in positions:
			if pos[0] != chrom:
				continue

			seqid = pos[0] + ':' + str(pos[1]+1) + '-' + str(pos[2])
			slen = pos[2]-pos[1]
			gc = bits_gc[pos[1]:pos[2]].count(True)
			rpt = bits_rpt[pos[1]:pos[2]].count(True)

			profiles.append((seqid, slen, gc, rpt))

		gcf.close()
		rptf.close()

	return profiles


def sample_sequences(positions, buildname, basedir, options):
	"""
	"""
	max_fails = 20
	max_trys = options.max_trys
	norpt = options.norpt
	nogc = options.nogc

	chrnames = sorted(set(map(lambda p: p[0], positions)))
	profiles = make_profile(positions, buildname, basedir)

	excluded = []
	if options.excludefile:
		excluded = read_bed_file(options.excludefile)

	#truncate (clear) file
	f = open(options.output, 'w')
	f.close()
	
	for chrom in chrnames:
		if options.quiet == False:
			sys.stderr.write("sampling from " + chrom + "\n")

		idxf_na = os.path.join(basedir, '.'.join([buildname, chrom, 'na', 'out']))
		idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
		idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))

		#this bit array is used to mark positions that are excluded from sampling
		#this will be updated as we sample more sequences in order to prevent sampled sequences from overlapping
		bits_na, naf = bitarray_fromfile(idxf_na)
		bits_gc, gcf = bitarray_fromfile(idxf_gc)
		bits_rpt, rptf = bitarray_fromfile(idxf_rpt)

		#mark excluded regions
		for pos in excluded:
			if pos[0] != chrom:
				continue
			bits_na[pos[1]:pos[2]] = True

		npos = 0
		#mark positive regions
		for pos in positions:
			if pos[0] != chrom:
				continue
			bits_na[pos[1]:pos[2]] = True
			npos+=1

		if options.count == 0:
			count = options.fold*npos
		else:
			count = options.count

		#initialize paramter
		#added by dlee 2/17/13	
		ncfails = 0
		rpt_err = options.rpt_err
		gc_err = options.gc_err

		sampled_positions = []
		while len(sampled_positions) < count:
			sampled_prof = random.choice(profiles)
			sampled_len = sampled_prof[1]
			sampled_gc = sampled_prof[2]
			sampled_rpt = sampled_prof[3]

			#relax rpt_err and gc_err if it keep fail to sample a region
			#added by dlee 2/17/13	
			if ncfails >= max_fails:
				if options.quiet == False:
					sys.stderr.write("reached max_fail. relax gc and rpt err criteria\n")
				ncfails = 0
				rpt_err += 0.01
				gc_err += 0.01

			rpt_err_allowed = int(rpt_err*sampled_len)
			gc_err_allowed = int(gc_err*sampled_len)
			trys = 0
			while trys < max_trys:
				trys += 1

				pos = random.randint(1, bits_na.length() - sampled_len)
				pos_e = pos+sampled_len

				if bits_na[pos:pos_e].any():
					continue

				if not norpt:
					pos_rpt = bits_rpt[pos:pos_e].count(True)
					if abs(sampled_rpt - pos_rpt) > rpt_err_allowed:
						continue

				if not nogc:
					pos_gc = bits_gc[pos:pos_e].count(True)
					if abs(sampled_gc - pos_gc) > gc_err_allowed:
						continue

				#accept the sampled position
				#mark the sampled regions
				bits_na[pos:pos_e] = True

				sampled_positions.append((chrom, pos, pos_e))

				#reset the counter of consecutive fails
				#added by dlee 2/17/13	
				ncfails = 0

				#print trys, chrom, pos, pos_e, sampled_len, pos_rpt, sampled_rpt, pos_gc, sampled_gc
				break
			else:
				#increase the counter of consecutive fails
				#added by dlee 2/17/13	
				ncfails += 1

				if options.quiet == False:
					sys.stderr.write(' '.join(["fail to sample from", \
							"len=", str(sampled_len), \
							"rpt=", str(sampled_rpt), \
							"gc=", str(sampled_gc)]) + '\n')

		naf.close()
		gcf.close()
		rptf.close()

		f = open(options.output, 'a')
		for spos in sorted(sampled_positions, key=lambda s: s[1]):
			f.write('\t'.join([spos[0], str(spos[1]), str(spos[2])]) + '\n')
		f.close()


def main(argv=sys.argv):
	usage = "usage: %prog [options] <Input Bed File> <Genome Build Name> <Base Directory>"

	desc  = "generate null sequences"
	parser = optparse.OptionParser(usage=usage, description=desc)

	parser.add_option("-x", dest="fold", type="int", \
		default = 1, help="number of sequence to sample, FOLD times of given dataset (default=1)")

	parser.add_option("-c", dest="count", type="int", \
		default=0, help="number of sequences to sample, override -x option (default=NA, obsolete)")

	parser.add_option("-r", dest="rseed", type="int", \
		default=1, help="random number seed (default=1)")

	parser.add_option("-g", dest="gc_err", type="float", \
		default=0.02, help="GC errors allowed (default=0.02)")

	parser.add_option("-t", dest="rpt_err", type="float", \
		default=0.02, help="RPT errors allowed (default=0.02)")

	parser.add_option("-e", dest="excludefile", \
		default="", help="filename that contains regions to be excluded (default=NA)")

	parser.add_option("-G", dest="nogc", action="store_true", \
		default=False, help="do not match gc-contents")

	parser.add_option("-R", dest="norpt", action="store_true", \
		default=False, help="do not match repeats")

	parser.add_option("-m", dest="max_trys", type="int", \
		default=10000, help="number of maximum trys to sample of one sequence (default=10000)")

	parser.add_option("-o", dest="output", default="nullseq_output.bed", \
			help="set the name of output file (default=nullseq_output.bed)")

	parser.add_option("-q", dest="quiet", default=False, action="store_true", \
  			help="supress messages (default=false)")

	(options, args) = parser.parse_args()

	if len(args) == 0:
		parser.print_help()
		sys.exit(0)

	if len(args) != 3:
		parser.error("incorrect number of arguments")
		parser.print_help()
		sys.exit(0)


	posfile = args[0]
	buildname = args[1]
	basedir = args[2]

	random.seed(options.rseed)

	if options.quiet == False:
		sys.stderr.write('Options:\n')
		sys.stderr.write('  N fold: ' + str(options.fold) + '\n')
		sys.stderr.write('  GC match: ' + str(not options.nogc) + '\n')
		sys.stderr.write('  repeat match: ' + str(not options.norpt) + '\n')
		sys.stderr.write('  GC error allowed: ' + str(options.gc_err) + '\n')
		sys.stderr.write('  repeat error allowed: ' + str(options.rpt_err) + '\n')
		sys.stderr.write('  random seed: ' + str(options.rseed) + '\n')
		sys.stderr.write('  max trys: ' + str(options.max_trys) + '\n')
		sys.stderr.write('  excluded regions: ' + options.excludefile+ '\n')
		sys.stderr.write('  output: ' + options.output + '\n')
		sys.stderr.write('\n')

		sys.stderr.write('Input args:\n')
		sys.stderr.write('  input bed file: ' + posfile+ '\n')
		sys.stderr.write('  genome build name: ' + buildname+ '\n')
		sys.stderr.write('  basedir: ' + basedir + '\n')
		sys.stderr.write('\n')

	positions = read_bed_file(posfile)

	sample_sequences(positions, buildname, basedir, options)
		
if __name__ == "__main__": main()