Mercurial > repos > xuebing > sampline
view sampline.py @ 1:fc8ab6f2276a default tip
Uploaded
author | xuebing |
---|---|
date | Thu, 08 Sep 2011 22:30:00 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python """ Sampling random records from a file. Each record is defined by a fixed number of lines. Usage: sampline.py [options] Options: -h, --help show this help message and exit -r, --replacement Sampling with replacement -i INPUT, --input=INPUT Input file -o OUTPUT, --output=OUTPUT Output file -k NSAMPLE, --nSample=NSAMPLE (required) number of records to be sampled/output -m RECSIZE, --recSize=RECSIZE (default=1) number of lines spanned by each record -n NSKIP, --nSkip=NSKIP (default=0) number of comment lines to skip at the beginning example: python sampline.py -i test10000.fastq -o out.txt --nSample=5 --recSize=4 --nSkip=0 --replacement """ import optparse, string, random,sys,math,itertools assert sys.version_info[:2] >= ( 2, 4 ) def main(): # Parse command line parser = optparse.OptionParser( usage="%prog [options] " ) parser.add_option( "-r", "--replacement", action="store_true", dest="replacement",default=False, help="Sampling with replacement" ) parser.add_option( "-i", "--input", dest="input", default=None, help="Input file" ) parser.add_option( "-o", "--output", dest="output", default=None, help="Output file" ) parser.add_option("-k","--nSample", type='int',dest="nSample",default=None, help="(required) number of records to be sampled/output" ) parser.add_option("-m","--recSize", type='int',dest="recSize",default=1, help="(default=1) number of lines spanned by each record" ) parser.add_option("-n","--nSkip", type='int',dest="nSkip",default=0, help="(default=0) number of comment lines to skip at the beginning") options, args = parser.parse_args() #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" sampline(options.input,options.output,options.nSample,options.recSize,options.nSkip,options.replacement) def sample_wr(population, k): "Chooses k random elements (with replacement) from a population" n = len(population) _random, _int = random.random, int # speed hack return [_int(_random() * n) for i in itertools.repeat(None, k)] # num of lines def readinput(filename): try: f = open (filename) except: print >> sys.stderr, "can't open file "+str(filename) sys.exit(0) nline = 0 for line in f: nline = nline + 1 f.close() return nline def sampline(infile,outfile,nSample,recSize,nSkip,replacement): # sample nSample records from file # each record contains recSize lines # skip the top nSkip lines nLine = readinput(infile) print 'num of lines in input: ',nLine print 'avoid sampling the first ',nSkip,' lines' print 'lines per record: ',recSize if (nLine-nSkip) % recSize: print >> sys.stderr, "the number of lines is not dividable by record size!" sys.exit(0) nTotalRecords = (nLine-nSkip) / recSize print "total number of records: ",nTotalRecords if replacement or nTotalRecords < nSample: sel = sample_wr(range(nTotalRecords),nSample) else: sel = random.sample(range(nTotalRecords),nSample) #print len(sel), sorted(sel) # output try: fout = open (outfile,'w') except: print >> sys.stderr, "can't open file "+str(outfile) sys.exit(0) fin = open(infile) n = 0 # index of line rec = "" # to store all content of a record nrepeat = 0 # number of times a record is sampled curr_rec = -1 for line in fin: if n < nSkip: n = n + 1 fout.write(line) continue if not (n-nSkip) % recSize:# a new record # print the previous sampled record for i in range(nrepeat): fout.write(rec) curr_rec = (n-nSkip)/recSize nrepeat = sel.count(curr_rec) if nrepeat: # sampled rec = line #print curr_rec,nrepeat elif (n-nSkip)/recSize == curr_rec: rec = rec + line n = n + 1 # if the last record is selected if curr_rec == nTotalRecords-1: for i in range(nrepeat): fout.write(rec) fin.close() fout.close() if __name__ == "__main__": main()