Mercurial > repos > xuebing > sampline
comparison sampline.py @ 1:fc8ab6f2276a default tip
Uploaded
author | xuebing |
---|---|
date | Thu, 08 Sep 2011 22:30:00 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:b7bc4f4399f0 | 1:fc8ab6f2276a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Sampling random records from a file. Each record is defined by a fixed number of lines. | |
5 | |
6 Usage: sampline.py [options] | |
7 | |
8 Options: | |
9 -h, --help show this help message and exit | |
10 -r, --replacement Sampling with replacement | |
11 -i INPUT, --input=INPUT | |
12 Input file | |
13 -o OUTPUT, --output=OUTPUT | |
14 Output file | |
15 -k NSAMPLE, --nSample=NSAMPLE | |
16 (required) number of records to be sampled/output | |
17 -m RECSIZE, --recSize=RECSIZE | |
18 (default=1) number of lines spanned by each record | |
19 -n NSKIP, --nSkip=NSKIP | |
20 (default=0) number of comment lines to skip at the | |
21 beginning | |
22 | |
23 example: | |
24 python sampline.py -i test10000.fastq -o out.txt --nSample=5 --recSize=4 --nSkip=0 --replacement | |
25 """ | |
26 | |
27 import optparse, string, random,sys,math,itertools | |
28 | |
29 assert sys.version_info[:2] >= ( 2, 4 ) | |
30 | |
31 def main(): | |
32 | |
33 # Parse command line | |
34 parser = optparse.OptionParser( usage="%prog [options] " ) | |
35 parser.add_option( "-r", "--replacement", action="store_true", dest="replacement",default=False, | |
36 help="Sampling with replacement" ) | |
37 parser.add_option( "-i", "--input", dest="input", default=None, | |
38 help="Input file" ) | |
39 parser.add_option( "-o", "--output", dest="output", default=None, | |
40 help="Output file" ) | |
41 parser.add_option("-k","--nSample", type='int',dest="nSample",default=None, | |
42 help="(required) number of records to be sampled/output" ) | |
43 parser.add_option("-m","--recSize", type='int',dest="recSize",default=1, | |
44 help="(default=1) number of lines spanned by each record" ) | |
45 parser.add_option("-n","--nSkip", type='int',dest="nSkip",default=0, | |
46 help="(default=0) number of comment lines to skip at the beginning") | |
47 options, args = parser.parse_args() | |
48 #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" | |
49 | |
50 sampline(options.input,options.output,options.nSample,options.recSize,options.nSkip,options.replacement) | |
51 | |
52 def sample_wr(population, k): | |
53 "Chooses k random elements (with replacement) from a population" | |
54 n = len(population) | |
55 _random, _int = random.random, int # speed hack | |
56 return [_int(_random() * n) for i in itertools.repeat(None, k)] | |
57 | |
58 # num of lines | |
59 def readinput(filename): | |
60 try: | |
61 f = open (filename) | |
62 except: | |
63 print >> sys.stderr, "can't open file "+str(filename) | |
64 sys.exit(0) | |
65 | |
66 nline = 0 | |
67 for line in f: | |
68 nline = nline + 1 | |
69 f.close() | |
70 return nline | |
71 | |
72 def sampline(infile,outfile,nSample,recSize,nSkip,replacement): | |
73 # sample nSample records from file | |
74 # each record contains recSize lines | |
75 # skip the top nSkip lines | |
76 | |
77 nLine = readinput(infile) | |
78 print 'num of lines in input: ',nLine | |
79 print 'avoid sampling the first ',nSkip,' lines' | |
80 print 'lines per record: ',recSize | |
81 | |
82 if (nLine-nSkip) % recSize: | |
83 print >> sys.stderr, "the number of lines is not dividable by record size!" | |
84 sys.exit(0) | |
85 | |
86 nTotalRecords = (nLine-nSkip) / recSize | |
87 print "total number of records: ",nTotalRecords | |
88 | |
89 if replacement or nTotalRecords < nSample: | |
90 sel = sample_wr(range(nTotalRecords),nSample) | |
91 else: | |
92 sel = random.sample(range(nTotalRecords),nSample) | |
93 | |
94 #print len(sel), sorted(sel) | |
95 | |
96 # output | |
97 try: | |
98 fout = open (outfile,'w') | |
99 except: | |
100 print >> sys.stderr, "can't open file "+str(outfile) | |
101 sys.exit(0) | |
102 fin = open(infile) | |
103 n = 0 # index of line | |
104 rec = "" # to store all content of a record | |
105 nrepeat = 0 # number of times a record is sampled | |
106 curr_rec = -1 | |
107 for line in fin: | |
108 if n < nSkip: | |
109 n = n + 1 | |
110 fout.write(line) | |
111 continue | |
112 | |
113 if not (n-nSkip) % recSize:# a new record | |
114 # print the previous sampled record | |
115 for i in range(nrepeat): | |
116 fout.write(rec) | |
117 curr_rec = (n-nSkip)/recSize | |
118 nrepeat = sel.count(curr_rec) | |
119 if nrepeat: # sampled | |
120 rec = line | |
121 #print curr_rec,nrepeat | |
122 elif (n-nSkip)/recSize == curr_rec: | |
123 rec = rec + line | |
124 n = n + 1 | |
125 # if the last record is selected | |
126 if curr_rec == nTotalRecords-1: | |
127 for i in range(nrepeat): | |
128 fout.write(rec) | |
129 fin.close() | |
130 fout.close() | |
131 | |
132 | |
133 if __name__ == "__main__": main() |