annotate Population.py @ 31:a631c2f6d913

Update to Miller Lab devshed revision 3c4110ffacc3
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:25:27 -0400
parents 8997f2ca8c7a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
1 #!/usr/bin/env python
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
2
27
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
3 import OrderedDict
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
4 import base64
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
5 import json
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
6 import zlib
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
7
8997f2ca8c7a Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents: 0
diff changeset
8 import sys
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
9
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
class Individual(object):
    """One individual, identified by an integer column and a name.

    An optional alias can be attached; when present it is what ``name``
    reports, while ``real_name`` always reports the original name.
    Equality and hashing consider only the column and the real name, so
    two entries that differ only by alias are treated as the same
    individual.
    """

    __slots__ = ['_column', '_name', '_alias']

    def __init__(self, column, name, alias=None):
        """Create an individual.

        column -- column identifier; coerced with int(), so numeric
                  strings are accepted and non-numeric input raises
                  ValueError.
        name   -- the individual's real name.
        alias  -- optional display name that takes precedence over name.
        """
        self._column = int(column)
        self._name = name
        self._alias = alias

    @property
    def column(self):
        """The column identifier as an int."""
        return self._column

    @property
    def name(self):
        """The alias when one is set, otherwise the real name."""
        if self._alias is None:
            return self._name
        return self._alias

    @property
    def alias(self):
        """The alias, or None when no alias has been assigned."""
        return self._alias

    @alias.setter
    def alias(self, alias):
        self._alias = alias

    @property
    def real_name(self):
        """The name given at construction, ignoring any alias."""
        return self._name

    def __eq__(self, other):
        """Compare by column and real name; the alias is ignored."""
        try:
            return (self._column == other._column
                    and self._name == other._name)
        except AttributeError:
            # ``other`` is not Individual-like (e.g. None): let Python fall
            # back to its default comparison instead of raising.
            return NotImplemented

    def __ne__(self, other):
        """Inverse of __eq__, propagating NotImplemented unchanged."""
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome

    def __hash__(self):
        # Kept consistent with __eq__: built from the same two fields,
        # neither of which can be reassigned through the public interface.
        return hash((self._column, self._name))

    def __repr__(self):
        return 'Individual: column={0} name={1} alias={2}'.format(
            self._column, self._name, self._alias)
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
46
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
47
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
class Population(object):
    """A named collection of individuals keyed by their column.

    Individuals are kept in an OrderedDict mapping ``individual.column``
    to the individual. The class can be filled from a tab-separated
    population file, from a list of ``column:name`` tags, or from a
    base64 + zlib wrapped JSON dictionary.
    """

    def __init__(self, name=None):
        """Create an empty population with an optional name."""
        self._columns = OrderedDict.OrderedDict()
        self._name = name

    @property
    def name(self):
        """The population's name (may be None)."""
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    def add_individual(self, individual, alias=None):
        """Register ``individual`` under its column.

        Adding an individual equal to the one already stored for that
        column is a no-op (the stored entry, including its alias, is
        kept). Adding a different individual for an occupied column
        raises ValueError.

        The ``alias`` parameter is accepted for interface compatibility
        but is currently ignored.
        """
        column = individual.column
        if column not in self._columns:
            self._columns[column] = individual
            return
        if self._columns[column] == individual:
            # Should this be an error?
            # Should we replace the alias using this entry?
            return
        # A string is not a valid exception object; raise a real exception
        # so the message actually reaches the caller.
        raise ValueError('Duplicate column: {0}'.format(individual))

    def is_superset(self, other):
        """Return True if every individual of ``other`` is also in self."""
        for column, other_individual in other._columns.items():
            our_individual = self._columns.get(column)
            if our_individual is None or our_individual != other_individual:
                return False
        return True

    def is_disjoint(self, other):
        """Return True if self and ``other`` share no equal individual."""
        for column, our_individual in self._columns.items():
            other_individual = other._columns.get(column)
            if other_individual is not None and other_individual == our_individual:
                return False
        return True

    def column_list(self):
        """Return the columns of all individuals as a list."""
        # list() keeps the result a real list even where keys() is a view.
        return list(self._columns.keys())

    def individual_with_column(self, column):
        """Return the individual stored for ``column``, or None."""
        if column in self._columns:
            return self._columns[column]
        return None

    @staticmethod
    def _first_token(individual):
        """Return the first whitespace-separated token of the display name.

        Raises IndexError when the name is empty or only whitespace.
        """
        return individual.name.split()[0]

    def tag_list(self, delimiter=':'):
        """Return ``column<delimiter>first_token`` strings, one per individual."""
        return [
            '{0}{1}{2}'.format(column, delimiter, self._first_token(individual))
            for column, individual in self._columns.items()
        ]

    def to_string(self, delimiter=':', separator=' ', replace_names_with=None):
        """Render the population as ``column<delimiter>name`` entries.

        Entries are joined with ``separator``. When ``replace_names_with``
        is not None it is used in place of every individual's name.
        """
        entries = []
        for column, individual in self._columns.items():
            if replace_names_with is None:
                value = individual.name
            else:
                value = replace_names_with
            entries.append('{0}{1}{2}'.format(column, delimiter, value))
        return separator.join(entries)

    def __str__(self):
        return self.to_string()

    def from_population_file(self, filename):
        """Add individuals from a tab-separated file.

        Each non-blank line must hold exactly three tab-separated fields:
        column, name and alias. A blank alias (after stripping) means
        "no alias". Blank lines are skipped; any other line with a
        different field count raises ValueError.
        """
        with open(filename) as fh:
            for line in fh:
                line = line.rstrip('\r\n')
                if not line:
                    # Tolerate empty lines (e.g. a trailing newline).
                    continue
                column, name, alias = line.split('\t')
                alias = alias.strip()
                individual = Individual(column, name)
                if alias:
                    individual.alias = alias
                self.add_individual(individual)

    def from_tag_list(self, tag_list, delimiter=':'):
        """Add individuals from ``column<delimiter>name`` tags.

        Only the first delimiter splits the tag, so the name part may
        itself contain the delimiter. A tag without the delimiter raises
        ValueError.
        """
        for tag in tag_list:
            column, name = tag.split(delimiter, 1)
            self.add_individual(Individual(column, name))

    def from_wrapped_dict(self, wrapped_dict):
        """Add individuals from a wrapped ``{name: column}`` dictionary.

        ``wrapped_dict`` is unwrapped with unwrap_dict(); each key is used
        as an individual's name and each value as its column.
        """
        unwrapped_dict = self.unwrap_dict(wrapped_dict)
        for name, column in unwrapped_dict.items():
            self.add_individual(Individual(column, name))

    def unwrap_dict(self, wrapped_dict):
        """Decode a base64 + zlib wrapped JSON document and return it.

        On Python 2 every unicode string inside decoded JSON objects is
        converted to a UTF-8 byte string; on Python 3 the native str
        values produced by json are returned as they are.
        """
        decoded_value = self.decode_value(wrapped_dict)
        decompressed_value = self.decompress_value(decoded_value)
        if not isinstance(decompressed_value, str):
            # Python 3: zlib yields bytes; hand json a text string.
            decompressed_value = decompressed_value.decode('utf-8')
        return json.loads(decompressed_value, object_hook=self._native_strings)

    @classmethod
    def _native_strings(cls, data):
        """Recursively turn Python 2 unicode strings into UTF-8 byte strings."""
        if str is not bytes:
            # Python 3: str is already the native text type.
            return data
        if isinstance(data, type(u'')):
            return data.encode('utf-8')
        if isinstance(data, list):
            return [cls._native_strings(item) for item in data]
        if isinstance(data, dict):
            return dict(
                (cls._native_strings(key), cls._native_strings(value))
                for key, value in data.items()
            )
        return data

    def decode_value(self, value):
        """Return ``value`` base64-decoded.

        On malformed input a diagnostic is written to stderr and the
        process exits with status 1.
        """
        try:
            return base64.b64decode(value)
        except (TypeError, ValueError) as message:
            # Python 2 reports bad input as TypeError, Python 3 as
            # binascii.Error (a ValueError subclass).
            sys.stderr.write(
                'base64.b64decode: {0}: {1}\n'.format(message, value))
            sys.exit(1)

    def decompress_value(self, value):
        """Return ``value`` zlib-decompressed.

        On corrupt input a diagnostic is written to stderr and the
        process exits with status 1.
        """
        try:
            return zlib.decompress(value)
        except zlib.error as message:
            sys.stderr.write('zlib.decompress: {0}\n'.format(message))
            sys.exit(1)

    def individual_names(self):
        """Yield the first token of each individual's display name."""
        for individual in self._columns.values():
            yield self._first_token(individual)
0
2c498d40ecde Uploaded
miller-lab
parents:
diff changeset
185