diff specify.py @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents 248b06e86022
children
line wrap: on
line diff
--- a/specify.py	Mon Jun 03 12:29:29 2013 -0400
+++ b/specify.py	Mon Jul 15 10:47:35 2013 -0400
@@ -1,147 +1,69 @@
 #!/usr/bin/env python
 
+import gd_util
 import sys
-import base64
-
-def parse_args(args):
-    if len(args) < 3:
-        usage()
-
-    input_file, output_file = args[1:3]
-
-    individuals = []
-    checkboxes = []
-    strings = []
-
-    for arg in args[3:]:
-        if ':' in arg:
-            arg_type, arg = arg.split(':', 1)
-        else:
-            print >> sys.stderr, "unknown argument:", arg
-            usage()
-
-        if arg_type == 'individual':
-            individuals.append(arg)
-        elif arg_type == 'checkbox':
-            checkboxes.append(arg)
-        elif arg_type == 'string':
-            strings.append(arg)
-        else:
-            print >> sys.stderr, "unknown argument:", arg
-            usage()
-
-    return input_file, output_file, individuals, checkboxes, strings
+from Population import Population
 
-def usage():
-    print >> sys.stderr, "Usage: %s <input> <output> [<individual:col:name> ...] [<checkbox:col:name> ...] [<string:base64> ...]" % (sys.argv[0])
-    sys.exit(1)
-
-def parse_individuals(individuals):
-    ind_col2name = {}
-    ind_name2col = {}
-
-    for individual in individuals:
-        if ':' in individual:
-            column, name = individual.split(':', 1)
-        else:
-            print >> sys.stderr, "invalid individual specification:", individual
-            usage()
+################################################################################
 
-        try:
-            column = int(column)
-        except:
-            print "individual column is not an integer:", individual
-            usage()
-
-        if column not in ind_col2name:
-            ind_col2name[column] = name
-        else:
-            if ind_col2name[column] != name:
-                print "duplicate individual column:", name, column, ind_col2name[column]
-                usage()
-
-        if name not in ind_name2col:
-            ind_name2col[name] = [column]
-        elif column not in ind_name2col[name]:
-            ind_name2col[name].append(column)
-
-    return ind_col2name, ind_name2col
-
-def parse_checkboxes(checkboxes, ind_col2name):
+def parse_string(str_arg, ind_token2col):
     columns = []
 
-    for checkbox in checkboxes:
-        if ':' in checkbox:
-            column, name = checkbox.split(':', 1)
-        else:
-            print >> sys.stderr, "invalid checkbox specification:", checkbox
-            usage()
+    string = gd_util.unwrap_string(str_arg)
+    tokens = find_tokens(string, ind_token2col)
 
-        try:
-            column = int(column)
-        except:
-            print "checkbox column is not an integer:", checkbox
-            usage()
-
-        if column not in ind_col2name:
-            print "individual not in SNP table:", name
-            usage()
-
-        if column not in columns:
-            columns.append(column)
+    for token in tokens:
+        col = ind_token2col[token]
+        if col not in columns:
+            columns.append(col)
 
     return columns
 
-def parse_strings(strings, ind_col2name, ind_name2col):
-    columns = []
-
-    for string in strings:
-        try:
-            decoded = base64.b64decode(string)
-        except:
-            print >> sys.stderr, "invalid base64 string:", string
-            usage()
-
-        names = find_names(decoded, ind_name2col.keys())
-        for name in names:
-            cols = ind_name2col[name]
-            if len(cols) == 1:
-                col = cols[0]
-                if col not in columns:
-                    columns.append(col)
-            else:
-                print >> sys.stderr, "name with multiple columns:", name
-                usage()
-
-    return columns
-
-def find_names(string, names):
+def find_tokens(string, tokens):
     rv = []
-    for name in names:
-        if name in string:
-            if name not in rv:
-                rv.append(name)
+    for token in tokens:
+        if token in string:
+            if token not in rv:
+                rv.append(token)
     return rv
 
+################################################################################
 
+if len(sys.argv) != 6:
+    gd_util.die('Usage')
 
+input, output, ind_arg, cb_arg, str_arg = sys.argv[1:]
 
-input_file, output_file, individuals, checkboxes, strings = parse_args(sys.argv)
-ind_col2name, ind_name2col = parse_individuals(individuals)
-cb_cols = parse_checkboxes(checkboxes, ind_col2name)
-str_cols = parse_strings(strings, ind_col2name, ind_name2col)
+p_total = Population()
+p_total.from_wrapped_dict(ind_arg)
+
+p_cb = Population()
+p_cb.from_wrapped_dict(cb_arg)
+
+if not p_total.is_superset(p_cb):
+    gd_util.die('There is a checked individual that does not appear in the SNP table')
+
+################################################################################
 
-out_cols = cb_cols
-for col in str_cols:
-    if col not in out_cols:
-        out_cols.append(col)
+ind_col2name = {}
+ind_token2col = {}
+for col in p_total.column_list():
+    individual = p_total.individual_with_column(col)
+    name = individual.name
+    ind_col2name[col] = name
+    first_token = name.split()[0]
+    if first_token not in ind_token2col:
+        ind_token2col[first_token] = col
+    else:
+        gd_util.die('duplicate first token: {0}'.format(first_token))
 
-with open(output_file, 'w') as fh:
-    for col in sorted(out_cols):
-        print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']])
+out_cols = p_cb.column_list()
+str_cols = parse_string(str_arg, ind_token2col)
+
+with open(output, 'w') as fh:
+    for col in sorted(ind_col2name.keys()):
+        if col in out_cols or col in str_cols:
+            print >> fh, '\t'.join([str(x) for x in [col, ind_col2name[col], '']])
 
 sys.exit(0)
 
-
-
-