diff tools/filters/randomlines.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/randomlines.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# Kanwei Li, 2010
+# Selects N random lines from a file and outputs to another file
+
+import random, sys
+
+def main():
+    """Write N randomly selected lines of an input file to an output file.
+
+    Command-line arguments (read from sys.argv):
+        sys.argv[1] -- path of the file to sample from
+        sys.argv[2] -- N, the number of lines to select (integer >= 1)
+        sys.argv[3] -- path of the file to write the selected lines to
+
+    The input is read in a single pass and at most N lines are held in
+    memory at any time (reservoir sampling).  The selected lines are written
+    separated by newlines, without a trailing newline, and are not
+    guaranteed to appear in their original order.
+
+    Exits with status 1, after writing a message to stderr, if N < 1 or if
+    the input has fewer than N lines; in both cases no output is written.
+    """
+    # Validate the count before opening the input so that no file handle is
+    # left open on the error path.
+    total_lines = int(sys.argv[2])
+    if total_lines < 1:
+        sys.stderr.write( "Must select at least one line." )
+        sys.exit(1)  # non-zero status: this is a failure, not a clean exit
+
+    kept = []
+    n = 0
+    with open(sys.argv[1], 'r') as infile:
+        for line in infile:
+            line = line.rstrip("\n")
+            n += 1
+            if n <= total_lines:
+                # The first N lines fill the reservoir unconditionally.
+                kept.append(line)
+            elif random.randint(1, n) <= total_lines:
+                # randint is inclusive on both ends, so line n is admitted
+                # with probability N/n and evicts a uniformly chosen line.
+                kept.pop(random.randint(0, total_lines-1))
+                kept.append(line)
+
+    if n < total_lines:
+        sys.stderr.write( "Error: asked to select more lines than there were in the file." )
+        sys.exit(1)
+
+    # The context manager guarantees the output is flushed and closed.
+    with open(sys.argv[3], 'w') as outfile:
+        outfile.write( "\n".join(kept) )
+    
+# Run the sampler only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()