Mercurial > repos > xuebing > sampline
comparison sampline.py @ 1:fc8ab6f2276a default tip
Uploaded
| author | xuebing |
|---|---|
| date | Thu, 08 Sep 2011 22:30:00 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:b7bc4f4399f0 | 1:fc8ab6f2276a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 Sampling random records from a file. Each record is defined by a fixed number of lines. | |
| 5 | |
| 6 Usage: sampline.py [options] | |
| 7 | |
| 8 Options: | |
| 9 -h, --help show this help message and exit | |
| 10 -r, --replacement Sampling with replacement | |
| 11 -i INPUT, --input=INPUT | |
| 12 Input file | |
| 13 -o OUTPUT, --output=OUTPUT | |
| 14 Output file | |
| 15 -k NSAMPLE, --nSample=NSAMPLE | |
| 16 (required) number of records to be sampled/output | |
| 17 -m RECSIZE, --recSize=RECSIZE | |
| 18 (default=1) number of lines spanned by each record | |
| 19 -n NSKIP, --nSkip=NSKIP | |
| 20 (default=0) number of comment lines to skip at the | |
| 21 beginning | |
| 22 | |
| 23 example: | |
| 24 python sampline.py -i test10000.fastq -o out.txt --nSample=5 --recSize=4 --nSkip=0 --replacement | |
| 25 """ | |
| 26 | |
| 27 import optparse, string, random,sys,math,itertools | |
| 28 | |
| 29 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 30 | |
| 31 def main(): | |
| 32 | |
| 33 # Parse command line | |
| 34 parser = optparse.OptionParser( usage="%prog [options] " ) | |
| 35 parser.add_option( "-r", "--replacement", action="store_true", dest="replacement",default=False, | |
| 36 help="Sampling with replacement" ) | |
| 37 parser.add_option( "-i", "--input", dest="input", default=None, | |
| 38 help="Input file" ) | |
| 39 parser.add_option( "-o", "--output", dest="output", default=None, | |
| 40 help="Output file" ) | |
| 41 parser.add_option("-k","--nSample", type='int',dest="nSample",default=None, | |
| 42 help="(required) number of records to be sampled/output" ) | |
| 43 parser.add_option("-m","--recSize", type='int',dest="recSize",default=1, | |
| 44 help="(default=1) number of lines spanned by each record" ) | |
| 45 parser.add_option("-n","--nSkip", type='int',dest="nSkip",default=0, | |
| 46 help="(default=0) number of comment lines to skip at the beginning") | |
| 47 options, args = parser.parse_args() | |
| 48 #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" | |
| 49 | |
| 50 sampline(options.input,options.output,options.nSample,options.recSize,options.nSkip,options.replacement) | |
| 51 | |
| 52 def sample_wr(population, k): | |
| 53 "Chooses k random elements (with replacement) from a population" | |
| 54 n = len(population) | |
| 55 _random, _int = random.random, int # speed hack | |
| 56 return [_int(_random() * n) for i in itertools.repeat(None, k)] | |
| 57 | |
| 58 # num of lines | |
| 59 def readinput(filename): | |
| 60 try: | |
| 61 f = open (filename) | |
| 62 except: | |
| 63 print >> sys.stderr, "can't open file "+str(filename) | |
| 64 sys.exit(0) | |
| 65 | |
| 66 nline = 0 | |
| 67 for line in f: | |
| 68 nline = nline + 1 | |
| 69 f.close() | |
| 70 return nline | |
| 71 | |
| 72 def sampline(infile,outfile,nSample,recSize,nSkip,replacement): | |
| 73 # sample nSample records from file | |
| 74 # each record contains recSize lines | |
| 75 # skip the top nSkip lines | |
| 76 | |
| 77 nLine = readinput(infile) | |
| 78 print 'num of lines in input: ',nLine | |
| 79 print 'avoid sampling the first ',nSkip,' lines' | |
| 80 print 'lines per record: ',recSize | |
| 81 | |
| 82 if (nLine-nSkip) % recSize: | |
| 83 print >> sys.stderr, "the number of lines is not dividable by record size!" | |
| 84 sys.exit(0) | |
| 85 | |
| 86 nTotalRecords = (nLine-nSkip) / recSize | |
| 87 print "total number of records: ",nTotalRecords | |
| 88 | |
| 89 if replacement or nTotalRecords < nSample: | |
| 90 sel = sample_wr(range(nTotalRecords),nSample) | |
| 91 else: | |
| 92 sel = random.sample(range(nTotalRecords),nSample) | |
| 93 | |
| 94 #print len(sel), sorted(sel) | |
| 95 | |
| 96 # output | |
| 97 try: | |
| 98 fout = open (outfile,'w') | |
| 99 except: | |
| 100 print >> sys.stderr, "can't open file "+str(outfile) | |
| 101 sys.exit(0) | |
| 102 fin = open(infile) | |
| 103 n = 0 # index of line | |
| 104 rec = "" # to store all content of a record | |
| 105 nrepeat = 0 # number of times a record is sampled | |
| 106 curr_rec = -1 | |
| 107 for line in fin: | |
| 108 if n < nSkip: | |
| 109 n = n + 1 | |
| 110 fout.write(line) | |
| 111 continue | |
| 112 | |
| 113 if not (n-nSkip) % recSize:# a new record | |
| 114 # print the previous sampled record | |
| 115 for i in range(nrepeat): | |
| 116 fout.write(rec) | |
| 117 curr_rec = (n-nSkip)/recSize | |
| 118 nrepeat = sel.count(curr_rec) | |
| 119 if nrepeat: # sampled | |
| 120 rec = line | |
| 121 #print curr_rec,nrepeat | |
| 122 elif (n-nSkip)/recSize == curr_rec: | |
| 123 rec = rec + line | |
| 124 n = n + 1 | |
| 125 # if the last record is selected | |
| 126 if curr_rec == nTotalRecords-1: | |
| 127 for i in range(nrepeat): | |
| 128 fout.write(rec) | |
| 129 fin.close() | |
| 130 fout.close() | |
| 131 | |
| 132 | |
| 133 if __name__ == "__main__": main() |
