diff make_gd_file.py @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_gd_file.py	Mon Jul 15 10:47:35 2013 -0400
@@ -0,0 +1,187 @@
+#!/usr/bin/env python
+
+import base64
+import json
+import math
+import re
+import sys
+
+identifier_regex = re.compile('[0-9A-Z_a-z]+$')
+
+def unwrap_column_names(string):
+    column_names = []
+    string = unwrap_string(string)
+    for line in string.split('\n'):
+        line = line.strip()
+        if is_identifier(line):
+            column_names.append(line)
+        else:
+            die('invalid column name format: {}'.format(line))
+    return column_names
+
+def unwrap_string(string):
+    try:
+        decoded = base64.b64decode(string)
+    except:
+        die('invalid base64 string: {}'.format(string))
+    return decoded
+
+def is_identifier(string):
+    match = identifier_regex.match(string)
+    if match:
+        return True
+    else:
+        return False
+
+def read_individual_names(filename):
+    tokens = []
+    names = []
+    with open(filename) as fh:
+        for line in fh:
+            line = line.rstrip('\r\n')
+            elems = line.split()
+
+            columns = len(elems)
+            if columns == 0:
+                continue
+
+            first_token = elems[0]
+
+            if columns == 1:
+                name = first_token
+            else:
+                keywords = ' '.join(elems[1:])
+                name = ' '.join([first_token, keywords])
+
+            if first_token not in tokens:
+                tokens.append(first_token)
+                names.append(name)
+            else:
+                die('duplicate first column entry in Names dataset: {}'.format(first_token))
+    return names
+
+def fold_line(line, maxlen, prefix):
+    prefix_len = len(prefix)
+
+    lines = []
+
+    while len(line) > maxlen:
+        split_points = []
+        state = 0
+        for i in range(maxlen - prefix_len):
+            c = line[i]
+            if state == 0:
+                if c == '"':
+                    state = 1
+                elif c in [ '{', ':', ',', '}', '[', ']' ]:
+                    split_points.append(i)
+            elif state == 1:
+                if c == '"':
+                    state = 0
+                elif c == '\\':
+                    state = 2
+            elif state == 2:
+                state = 1
+        idx = split_points[-1]
+        lines.append('{0}{1}'.format(prefix, line[:idx+1]))
+        line = line[idx+1:]
+
+    lines.append('{0}{1}'.format(prefix, line))
+
+    return lines
+
+def die(message):
+    print >> sys.stderr, message
+    sys.exit(1)
+
+################################################################################
+
+type_to_columns = {
+    'gd_snp':4,
+    'gd_genotype':1
+}
+
+if len(sys.argv) != 12:
+    print >> sys.stderr, 'Usage'
+    sys.exit(1)
+
+input, scaffold_col, pos_col, ref_col, rPos_col, preamble_arg, names, species_arg, dbkey, output_type, output = sys.argv[1:12]
+
+preamble_column_names = unwrap_column_names(preamble_arg)
+first_individual_column = len(preamble_column_names) + 1
+
+individual_names = read_individual_names(names)
+
+species = unwrap_string(species_arg)
+if not is_identifier(species):
+    die('invalid species format: {}'.format(species))
+
+if not output_type in type_to_columns:
+    die('unknown output type: {}'.format(output_type))
+columns_per_individual = type_to_columns[output_type]
+
+jdict = {}
+
+column_names = preamble_column_names[:]
+for i in range(1, len(individual_names) + 1):
+    if output_type == 'gd_snp':
+        column_names.append('{}A'.format(i))
+        column_names.append('{}B'.format(i))
+        column_names.append('{}G'.format(i))
+        column_names.append('{}Q'.format(i))
+    elif output_type == 'gd_genotype':
+        column_names.append('{}G'.format(i))
+    else:
+        die('unknown output type: {}'.format(output_type))
+
+jdict['column_names'] = column_names
+
+individuals = []
+
+for pos, individual in enumerate(individual_names):
+    col = first_individual_column + pos * columns_per_individual
+    individuals.append([individual, col])
+
+jdict['individuals'] = individuals
+
+jdict['scaffold'] = int(scaffold_col)
+jdict['pos'] = int(pos_col)
+jdict['ref'] = int(ref_col)
+jdict['rPos'] = int(rPos_col)
+
+jdict['species'] = species
+jdict['dbkey'] = dbkey
+
+json_string = json.dumps(jdict, separators=(',',':'), sort_keys=True)
+
+min_cols = len(column_names)
+pos_col = int(pos_col) - 1
+rPos_col = int(rPos_col) - 1
+
+def is_int(string):
+    try:
+        int(string)
+        return True
+    except ValueError:
+        return False
+
+with open(output, 'w') as ofh:
+    lines = fold_line(json_string, 200, '#')
+    for line in lines:
+        print >> ofh, line
+
+    with open(input) as fh:
+        line_number = 0
+        for line in fh:
+            line_number += 1
+            if line[0] == '#':
+                continue
+            line = line.rstrip('\r\n')
+            elems = line.split('\t')
+            if len(elems) < min_cols:
+                die('Too few columns on line {0} of input file.  Expecting {1}, saw {2}.'.format(line_number, min_cols, len(elems)))
+            if not is_int(elems[pos_col]):
+                die('bad pos on line {0} column {1} of input file: {2}'.format(line_number, pos_col+1, elems[pos_col]))
+            if not is_int(elems[rPos_col]):
+                die('bad rPos on line {0} column {1} of input file: {2}'.format(line_number, rPos_col+1, elems[rPos_col]))
+            print >> ofh, line