# HG changeset patch # User xuebing # Date 1315535400 14400 # Node ID fc8ab6f2276abeae110db861c1498469f31492d2 # Parent b7bc4f4399f05e891c2935da65dfe00e0bab89cd Uploaded diff -r b7bc4f4399f0 -r fc8ab6f2276a sampline.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sampline.py Thu Sep 08 22:30:00 2011 -0400 @@ -0,0 +1,133 @@ +#!/usr/bin/env python + +""" +Sampling random records from a file. Each record is defined by a fixed number of lines. + +Usage: sampline.py [options] + +Options: + -h, --help show this help message and exit + -r, --replacement Sampling with replacement + -i INPUT, --input=INPUT + Input file + -o OUTPUT, --output=OUTPUT + Output file + -k NSAMPLE, --nSample=NSAMPLE + (required) number of records to be sampled/output + -m RECSIZE, --recSize=RECSIZE + (default=1) number of lines spanned by each record + -n NSKIP, --nSkip=NSKIP + (default=0) number of comment lines to skip at the + beginning + +example: + python sampline.py -i test10000.fastq -o out.txt --nSample=5 --recSize=4 --nSkip=0 --replacement +""" + +import optparse, string, random,sys,math,itertools + +assert sys.version_info[:2] >= ( 2, 4 ) + +def main(): + + # Parse command line + parser = optparse.OptionParser( usage="%prog [options] " ) + parser.add_option( "-r", "--replacement", action="store_true", dest="replacement",default=False, + help="Sampling with replacement" ) + parser.add_option( "-i", "--input", dest="input", default=None, + help="Input file" ) + parser.add_option( "-o", "--output", dest="output", default=None, + help="Output file" ) + parser.add_option("-k","--nSample", type='int',dest="nSample",default=None, + help="(required) number of records to be sampled/output" ) + parser.add_option("-m","--recSize", type='int',dest="recSize",default=1, + help="(default=1) number of lines spanned by each record" ) + parser.add_option("-n","--nSkip", type='int',dest="nSkip",default=0, + help="(default=0) number of comment lines to skip at the beginning") + options, args = parser.parse_args() + #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" + + sampline(options.input,options.output,options.nSample,options.recSize,options.nSkip,options.replacement) + +def sample_wr(population, k): + "Chooses k random elements (with replacement) from a population" + n = len(population) + _random, _int = random.random, int # speed hack + return [_int(_random() * n) for i in itertools.repeat(None, k)] + +# num of lines +def readinput(filename): + try: + f = open (filename) + except: + print >> sys.stderr, "can't open file "+str(filename) + sys.exit(0) + + nline = 0 + for line in f: + nline = nline + 1 + f.close() + return nline + +def sampline(infile,outfile,nSample,recSize,nSkip,replacement): + # sample nSample records from file + # each record contains recSize lines + # skip the top nSkip lines + + nLine = readinput(infile) + print 'num of lines in input: ',nLine + print 'avoid sampling the first ',nSkip,' lines' + print 'lines per record: ',recSize + + if (nLine-nSkip) % recSize: + print >> sys.stderr, "the number of lines is not dividable by record size!" + sys.exit(0) + + nTotalRecords = (nLine-nSkip) / recSize + print "total number of records: ",nTotalRecords + + if replacement or nTotalRecords < nSample: + sel = sample_wr(range(nTotalRecords),nSample) + else: + sel = random.sample(range(nTotalRecords),nSample) + + #print len(sel), sorted(sel) + + # output + try: + fout = open (outfile,'w') + except: + print >> sys.stderr, "can't open file "+str(outfile) + sys.exit(0) + fin = open(infile) + n = 0 # index of line + rec = "" # to store all content of a record + nrepeat = 0 # number of times a record is sampled + curr_rec = -1 + for line in fin: + if n < nSkip: + n = n + 1 + fout.write(line) + continue + + if not (n-nSkip) % recSize:# a new record + # print the previous sampled record + for i in range(nrepeat): + fout.write(rec) + curr_rec = (n-nSkip)/recSize + nrepeat = sel.count(curr_rec) + if nrepeat: # sampled + rec = line + #print curr_rec,nrepeat + elif (n-nSkip)/recSize == curr_rec: + rec = rec + line + n = n + 1 + # if the last record is selected + if curr_rec == nTotalRecords-1: + for i in range(nrepeat): + fout.write(rec) + fin.close() + fout.close() + + +if __name__ == "__main__": main()