Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/plinkbinJZ.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python2.4 | |
2 """ | |
3 """ | |
4 | |
5 import optparse,os,subprocess,gzip,struct,time,commands | |
6 from array import array | |
7 | |
8 #from AIMS import util | |
9 #from pga import util as pgautil | |
10 | |
11 __FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $' | |
12 | |
13 VERBOSE = True | |
14 | |
15 MISSING_ALLELES = set(['N', '0', '.', '-','']) | |
16 | |
17 AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)]) | |
18 | |
19 MAGIC_BYTE1 = '00110110' | |
20 MAGIC_BYTE2 = '11011000' | |
21 FORMAT_SNP_MAJOR_BYTE = '10000000' | |
22 FORMAT_IND_MAJOR_BYTE = '00000000' | |
23 MAGIC1 = (0, 3, 1, 2) | |
24 MAGIC2 = (3, 1, 2, 0) | |
25 FORMAT_SNP_MAJOR = (2, 0, 0, 0) | |
26 FORMAT_IND_MAJOR = (0, 0, 0, 0) | |
27 HEADER_LENGTH = 3 | |
28 | |
29 HOM0 = 3 | |
30 HOM1 = 0 | |
31 MISS = 2 | |
32 HET = 1 | |
33 HOM0_GENO = (0, 0) | |
34 HOM1_GENO = (1, 1) | |
35 HET_GENO = (0, 1) | |
36 MISS_GENO = (-9, -9) | |
37 | |
38 GENO_TO_GCODE = { | |
39 HOM0_GENO: HOM0, | |
40 HET_GENO: HET, | |
41 HOM1_GENO: HOM1, | |
42 MISS_GENO: MISS, | |
43 } | |
44 | |
45 CHROM_REPLACE = { | |
46 'X': '23', | |
47 'Y': '24', | |
48 'XY': '25', | |
49 'MT': '26', | |
50 'M': '26', | |
51 } | |
52 | |
53 MAP_LINE_EXCEPTION_TEXT = """ | |
54 One or more lines in the *.map file has only three fields. | |
55 The line was: | |
56 | |
57 %s | |
58 | |
59 If you are running rgGRR through EPMP, this is usually a | |
60 sign that you are using an old version of the map file. | |
61 You can correct the problem by re-running Subject QC. If | |
62 you have already tried this, please contact the developers, | |
63 or file a bug. | |
64 """ | |
65 | |
66 INT_TO_GCODE = { | |
67 0: array('i', (0, 0, 0, 0)), 1: array('i', (2, 0, 0, 0)), 2: array('i', (1, 0, 0, 0)), 3: array('i', (3, 0, 0, 0)), | |
68 4: array('i', (0, 2, 0, 0)), 5: array('i', (2, 2, 0, 0)), 6: array('i', (1, 2, 0, 0)), 7: array('i', (3, 2, 0, 0)), | |
69 8: array('i', (0, 1, 0, 0)), 9: array('i', (2, 1, 0, 0)), 10: array('i', (1, 1, 0, 0)), 11: array('i', (3, 1, 0, 0)), | |
70 12: array('i', (0, 3, 0, 0)), 13: array('i', (2, 3, 0, 0)), 14: array('i', (1, 3, 0, 0)), 15: array('i', (3, 3, 0, 0)), | |
71 16: array('i', (0, 0, 2, 0)), 17: array('i', (2, 0, 2, 0)), 18: array('i', (1, 0, 2, 0)), 19: array('i', (3, 0, 2, 0)), | |
72 20: array('i', (0, 2, 2, 0)), 21: array('i', (2, 2, 2, 0)), 22: array('i', (1, 2, 2, 0)), 23: array('i', (3, 2, 2, 0)), | |
73 24: array('i', (0, 1, 2, 0)), 25: array('i', (2, 1, 2, 0)), 26: array('i', (1, 1, 2, 0)), 27: array('i', (3, 1, 2, 0)), | |
74 28: array('i', (0, 3, 2, 0)), 29: array('i', (2, 3, 2, 0)), 30: array('i', (1, 3, 2, 0)), 31: array('i', (3, 3, 2, 0)), | |
75 32: array('i', (0, 0, 1, 0)), 33: array('i', (2, 0, 1, 0)), 34: array('i', (1, 0, 1, 0)), 35: array('i', (3, 0, 1, 0)), | |
76 36: array('i', (0, 2, 1, 0)), 37: array('i', (2, 2, 1, 0)), 38: array('i', (1, 2, 1, 0)), 39: array('i', (3, 2, 1, 0)), | |
77 40: array('i', (0, 1, 1, 0)), 41: array('i', (2, 1, 1, 0)), 42: array('i', (1, 1, 1, 0)), 43: array('i', (3, 1, 1, 0)), | |
78 44: array('i', (0, 3, 1, 0)), 45: array('i', (2, 3, 1, 0)), 46: array('i', (1, 3, 1, 0)), 47: array('i', (3, 3, 1, 0)), | |
79 48: array('i', (0, 0, 3, 0)), 49: array('i', (2, 0, 3, 0)), 50: array('i', (1, 0, 3, 0)), 51: array('i', (3, 0, 3, 0)), | |
80 52: array('i', (0, 2, 3, 0)), 53: array('i', (2, 2, 3, 0)), 54: array('i', (1, 2, 3, 0)), 55: array('i', (3, 2, 3, 0)), | |
81 56: array('i', (0, 1, 3, 0)), 57: array('i', (2, 1, 3, 0)), 58: array('i', (1, 1, 3, 0)), 59: array('i', (3, 1, 3, 0)), | |
82 60: array('i', (0, 3, 3, 0)), 61: array('i', (2, 3, 3, 0)), 62: array('i', (1, 3, 3, 0)), 63: array('i', (3, 3, 3, 0)), | |
83 64: array('i', (0, 0, 0, 2)), 65: array('i', (2, 0, 0, 2)), 66: array('i', (1, 0, 0, 2)), 67: array('i', (3, 0, 0, 2)), | |
84 68: array('i', (0, 2, 0, 2)), 69: array('i', (2, 2, 0, 2)), 70: array('i', (1, 2, 0, 2)), 71: array('i', (3, 2, 0, 2)), | |
85 72: array('i', (0, 1, 0, 2)), 73: array('i', (2, 1, 0, 2)), 74: array('i', (1, 1, 0, 2)), 75: array('i', (3, 1, 0, 2)), | |
86 76: array('i', (0, 3, 0, 2)), 77: array('i', (2, 3, 0, 2)), 78: array('i', (1, 3, 0, 2)), 79: array('i', (3, 3, 0, 2)), | |
87 80: array('i', (0, 0, 2, 2)), 81: array('i', (2, 0, 2, 2)), 82: array('i', (1, 0, 2, 2)), 83: array('i', (3, 0, 2, 2)), | |
88 84: array('i', (0, 2, 2, 2)), 85: array('i', (2, 2, 2, 2)), 86: array('i', (1, 2, 2, 2)), 87: array('i', (3, 2, 2, 2)), | |
89 88: array('i', (0, 1, 2, 2)), 89: array('i', (2, 1, 2, 2)), 90: array('i', (1, 1, 2, 2)), 91: array('i', (3, 1, 2, 2)), | |
90 92: array('i', (0, 3, 2, 2)), 93: array('i', (2, 3, 2, 2)), 94: array('i', (1, 3, 2, 2)), 95: array('i', (3, 3, 2, 2)), | |
91 96: array('i', (0, 0, 1, 2)), 97: array('i', (2, 0, 1, 2)), 98: array('i', (1, 0, 1, 2)), 99: array('i', (3, 0, 1, 2)), | |
92 100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)), | |
93 104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)), | |
94 108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)), | |
95 112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', (3, 0, 3, 2)), | |
96 116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)), | |
97 120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)), | |
98 124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)), | |
99 128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)), | |
100 132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)), | |
101 136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)), | |
102 140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)), | |
103 144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)), | |
104 148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)), | |
105 152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)), | |
106 156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)), | |
107 160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)), | |
108 164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)), | |
109 168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)), | |
110 172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)), | |
111 176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)), | |
112 180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)), | |
113 184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)), | |
114 188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)), | |
115 192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)), | |
116 196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)), | |
117 200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)), | |
118 204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)), | |
119 208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)), | |
120 212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)), | |
121 216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)), | |
122 220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)), | |
123 224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)), | |
124 228: array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)), | |
125 232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)), | |
126 236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)), | |
127 240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)), | |
128 244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)), | |
129 248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)), | |
130 252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)), | |
131 } | |
132 | |
133 GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()]) | |
134 | |
135 ### Exceptions | |
136 class DuplicateMarkerInMapFile(Exception): pass | |
137 class MapLineTooShort(Exception): pass | |
138 class ThirdAllele(Exception): pass | |
139 class PedError(Exception): pass | |
140 class BadMagic(Exception): | |
141 """ Raised when one of the MAGIC bytes in a bed file does not match | |
142 """ | |
143 pass | |
144 class BedError(Exception): | |
145 """ Raised when parsing a bed file runs into problems | |
146 """ | |
147 pass | |
148 class UnknownGenocode(Exception): | |
149 """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?) | |
150 """ | |
151 pass | |
152 class UnknownGeno(Exception): pass | |
153 | |
154 ### Utility functions | |
155 | |
156 def timenow(): | |
157 """return current time as a string | |
158 """ | |
159 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) | |
160 | |
161 def ceiling(n, k): | |
162 ''' Return the least multiple of k which is greater than n | |
163 ''' | |
164 m = n % k | |
165 if m == 0: | |
166 return n | |
167 else: | |
168 return n + k - m | |
169 | |
170 def nbytes(n): | |
171 ''' Return the number of bytes required for n subjects | |
172 ''' | |
173 return 2*ceiling(n, 4)/8 | |
174 | |
175 ### Primary module functionality | |
176 class LPed: | |
177 """ The uber-class for processing the Linkage-format *.ped/*.map files | |
178 """ | |
179 def __init__(self, base): | |
180 self.base = base | |
181 self._ped = Ped('%s.ped' % (self.base)) | |
182 self._map = Map('%s.map' % (self.base)) | |
183 | |
184 self._markers = {} | |
185 self._ordered_markers = [] | |
186 self._marker_allele_lookup = {} | |
187 self._autosomal_indices = set() | |
188 | |
189 self._subjects = {} | |
190 self._ordered_subjects = [] | |
191 | |
192 self._genotypes = [] | |
193 | |
194 def parse(self): | |
195 """ | |
196 """ | |
197 if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow()) | |
198 self._map.parse() | |
199 self._markers = self._map._markers | |
200 self._ordered_markers = self._map._ordered_markers | |
201 self._autosomal_indices = self._map._autosomal_indices | |
202 | |
203 self._ped.parse(self._ordered_markers) | |
204 self._subjects = self._ped._subjects | |
205 self._ordered_subjects = self._ped._ordered_subjects | |
206 self._genotypes = self._ped._genotypes | |
207 self._marker_allele_lookup = self._ped._marker_allele_lookup | |
208 | |
209 ### Adjust self._markers based on the allele information | |
210 ### we got from parsing the ped file | |
211 for m, name in enumerate(self._ordered_markers): | |
212 a1, a2 = self._marker_allele_lookup[m][HET] | |
213 self._markers[name][-2] = a1 | |
214 self._markers[name][-1] = a2 | |
215 if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow()) | |
216 | |
217 def getSubjectInfo(self, fid, oiid): | |
218 """ | |
219 """ | |
220 return self._subject_info[(fid, oiid)] | |
221 | |
222 def getSubjectInfoByLine(self, line): | |
223 """ | |
224 """ | |
225 return self._subject_info[self._ordered_subjects[line]] | |
226 | |
227 def getGenotypesByIndices(self, s, mlist, format): | |
228 """ needed for grr if lped - deprecated but.. | |
229 """ | |
230 mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ? | |
231 raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)]) | |
232 if format == 'raw': | |
233 return raw_array | |
234 elif format == 'ref': | |
235 result = array('i', [0]*len(mlist)) | |
236 for m, gcode in enumerate(raw_array): | |
237 if gcode == HOM0: | |
238 nref = 3 | |
239 elif gcode == HET: | |
240 nref = 2 | |
241 elif gcode == HOM1: | |
242 nref = 1 | |
243 else: | |
244 nref = 0 | |
245 result[m] = nref | |
246 return result | |
247 else: | |
248 result = [] | |
249 for m, gcode in enumerate(raw_array): | |
250 result.append(self._marker_allele_lookup[m][gcode]) | |
251 return result | |
252 | |
253 def writebed(self, base): | |
254 """ | |
255 """ | |
256 dst_name = '%s.fam' % (base) | |
257 print 'Writing pedigree information to [ %s ]' % (dst_name) | |
258 dst = open(dst_name, 'w') | |
259 for skey in self._ordered_subjects: | |
260 (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey] | |
261 dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe)) | |
262 dst.close() | |
263 | |
264 dst_name = '%s.bim' % (base) | |
265 print 'Writing map (extended format) information to [ %s ]' % (dst_name) | |
266 dst = open(dst_name, 'w') | |
267 for m, marker in enumerate(self._ordered_markers): | |
268 chrom, name, genpos, abspos, a1, a2 = self._markers[marker] | |
269 dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2)) | |
270 dst.close() | |
271 | |
272 bed_name = '%s.bed' % (base) | |
273 print 'Writing genotype bitfile to [ %s ]' % (bed_name) | |
274 print 'Using (default) SNP-major mode' | |
275 bed = open(bed_name, 'w') | |
276 | |
277 ### Write the 3 header bytes | |
278 bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2))) | |
279 bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2))) | |
280 bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2))) | |
281 | |
282 ### Calculate how many "pad bits" we should add after the last subject | |
283 nsubjects = len(self._ordered_subjects) | |
284 nmarkers = len(self._ordered_markers) | |
285 total_bytes = nbytes(nsubjects) | |
286 nbits = nsubjects * 2 | |
287 pad_nibbles = ((total_bytes * 8) - nbits)/2 | |
288 pad = array('i', [0]*pad_nibbles) | |
289 | |
290 ### And now write genotypes to the file | |
291 for m in xrange(nmarkers): | |
292 geno = self._genotypes[m] | |
293 geno.extend(pad) | |
294 bytes = len(geno)/4 | |
295 for b in range(bytes): | |
296 idx = b*4 | |
297 gcode = tuple(geno[idx:idx+4]) | |
298 try: | |
299 byte = struct.pack('B', GCODE_TO_INT[gcode]) | |
300 except KeyError: | |
301 print m, b, gcode | |
302 raise | |
303 bed.write(byte) | |
304 bed.close() | |
305 | |
306 def autosomal_indices(self): | |
307 """ Return the indices of markers in this ped/map that are autosomal. | |
308 This is used by rgGRR so that it can select a random set of markers | |
309 from the autosomes (sex chroms screw up the plot) | |
310 """ | |
311 return self._autosomal_indices | |
312 | |
313 class Ped: | |
314 def __init__(self, path): | |
315 self.path = path | |
316 self._subjects = {} | |
317 self._ordered_subjects = [] | |
318 self._genotypes = [] | |
319 self._marker_allele_lookup = {} | |
320 | |
321 def lineCount(self,infile): | |
322 """ count the number of lines in a file - efficiently using wget | |
323 """ | |
324 return int(commands.getoutput('wc -l %s' % (infile)).split()[0]) | |
325 | |
326 | |
327 def parse(self, markers): | |
328 """ Parse a given file -- this needs to be memory-efficient so that large | |
329 files can be parsed (~1 million markers on ~5000 subjects?). It | |
330 should also be fast, if possible. | |
331 """ | |
332 | |
333 ### Find out how many lines are in the file so we can ... | |
334 nsubjects = self.lineCount(self.path) | |
335 ### ... Pre-allocate the genotype arrays | |
336 nmarkers = len(markers) | |
337 _marker_alleles = [['0', '0'] for _ in xrange(nmarkers)] | |
338 self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)] | |
339 | |
340 if self.path.endswith('.gz'): | |
341 pfile = gzip.open(self.path, 'r') | |
342 else: | |
343 pfile = open(self.path, 'r') | |
344 | |
345 for s, line in enumerate(pfile): | |
346 line = line.strip() | |
347 if not line: | |
348 continue | |
349 | |
350 fid, iid, did, mid, sex, phe, genos = line.split(None, 6) | |
351 sid = iid.split('.')[0] | |
352 d_sid = did.split('.')[0] | |
353 m_sid = mid.split('.')[0] | |
354 | |
355 skey = (fid, iid) | |
356 self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) | |
357 self._ordered_subjects.append(skey) | |
358 | |
359 genotypes = genos.split() | |
360 | |
361 for m, marker in enumerate(markers): | |
362 idx = m*2 | |
363 a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m | |
364 s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m | |
365 | |
366 ### FIXME: I think this can still be faster, and simpler to read | |
367 # Two pieces of logic intertwined here: first, we need to code | |
368 # this genotype as HOM0, HOM1, HET or MISS. Second, we need to | |
369 # keep an ongoing record of the genotypes seen for this marker | |
370 if a1 == a2: | |
371 if a1 in MISSING_ALLELES: | |
372 geno = MISS_GENO | |
373 else: | |
374 if s1 == '0': | |
375 seen[0] = a1 | |
376 elif s1 == a1 or s2 == a2: | |
377 pass | |
378 elif s2 == '0': | |
379 seen[1] = a1 | |
380 else: | |
381 raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen))) | |
382 | |
383 if a1 == seen[0]: | |
384 geno = HOM0_GENO | |
385 elif a1 == seen[1]: | |
386 geno = HOM1_GENO | |
387 else: | |
388 raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen))) | |
389 elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES: | |
390 geno = MISS_GENO | |
391 else: | |
392 geno = HET_GENO | |
393 if s1 == '0': | |
394 seen[0] = a1 | |
395 seen[1] = a2 | |
396 elif s2 == '0': | |
397 if s1 == a1: | |
398 seen[1] = a2 | |
399 elif s1 == a2: | |
400 seen[1] = a1 | |
401 else: | |
402 raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen))) | |
403 else: | |
404 if sorted(seen) != sorted((a1, a2)): | |
405 raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen))) | |
406 | |
407 gcode = GENO_TO_GCODE.get(geno, None) | |
408 if gcode is None: | |
409 raise UnknownGeno(str(geno)) | |
410 self._genotypes[m][s] = gcode | |
411 | |
412 # Build the _marker_allele_lookup table | |
413 for m, alleles in enumerate(_marker_alleles): | |
414 if len(alleles) == 2: | |
415 a1, a2 = alleles | |
416 elif len(alleles) == 1: | |
417 a1 = alleles[0] | |
418 a2 = '0' | |
419 else: | |
420 print 'All alleles blank for %s: %s' % (m, str(alleles)) | |
421 raise | |
422 | |
423 self._marker_allele_lookup[m] = { | |
424 HOM0: (a2, a2), | |
425 HOM1: (a1, a1), | |
426 HET : (a1, a2), | |
427 MISS: ('0','0'), | |
428 } | |
429 | |
430 if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects), nsubjects, self.path) | |
431 | |
432 class Map: | |
433 def __init__(self, path=None): | |
434 self.path = path | |
435 self._markers = {} | |
436 self._ordered_markers = [] | |
437 self._autosomal_indices = set() | |
438 | |
439 def __len__(self): | |
440 return len(self._markers) | |
441 | |
442 def parse(self): | |
443 """ Parse a Linkage-format map file | |
444 """ | |
445 if self.path.endswith('.gz'): | |
446 fh = gzip.open(self.path, 'r') | |
447 else: | |
448 fh = open(self.path, 'r') | |
449 | |
450 for i, line in enumerate(fh): | |
451 line = line.strip() | |
452 if not line: | |
453 continue | |
454 | |
455 fields = line.split() | |
456 if len(fields) < 4: | |
457 raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line), len(fields))) | |
458 else: | |
459 chrom, name, genpos, abspos = fields | |
460 if name in self._markers: | |
461 raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path)) | |
462 abspos = int(abspos) | |
463 if abspos < 0: | |
464 continue | |
465 if chrom in AUTOSOMES: | |
466 self._autosomal_indices.add(i) | |
467 chrom = CHROM_REPLACE.get(chrom, chrom) | |
468 self._markers[name] = [chrom, name, genpos, abspos, None, None] | |
469 self._ordered_markers.append(name) | |
470 fh.close() | |
471 if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers), i, self.path) | |
472 | |
473 class BPed: | |
474 """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam | |
475 """ | |
476 def __init__(self, base): | |
477 self.base = base | |
478 self._bed = Bed('%s.bed' % (self.base)) | |
479 self._bim = Bim('%s.bim' % (self.base)) | |
480 self._fam = Fam('%s.fam' % (self.base)) | |
481 | |
482 self._markers = {} | |
483 self._ordered_markers = [] | |
484 self._marker_allele_lookup = {} | |
485 self._autosomal_indices = set() | |
486 | |
487 self._subjects = {} | |
488 self._ordered_subjects = [] | |
489 | |
490 self._genotypes = [] | |
491 | |
492 def parse(self, quick=False): | |
493 """ | |
494 """ | |
495 self._quick = quick | |
496 | |
497 self._bim.parse() | |
498 self._markers = self._bim._markers | |
499 self._ordered_markers = self._bim._ordered_markers | |
500 self._marker_allele_lookup = self._bim._marker_allele_lookup | |
501 self._autosomal_indices = self._bim._autosomal_indices | |
502 | |
503 self._fam.parse() | |
504 self._subjects = self._fam._subjects | |
505 self._ordered_subjects = self._fam._ordered_subjects | |
506 | |
507 self._bed.parse(self._ordered_subjects, self._ordered_markers, quick=quick) | |
508 self._bedf = self._bed._fh | |
509 self._genotypes = self._bed._genotypes | |
510 self.nsubjects = len(self._ordered_subjects) | |
511 self.nmarkers = len(self._ordered_markers) | |
512 self._bytes_per_marker = nbytes(self.nsubjects) | |
513 | |
514 def writeped(self, path=None): | |
515 """ | |
516 """ | |
517 path = self.path = path or self.path | |
518 | |
519 map_name = self.path.replace('.bed', '.map') | |
520 print 'Writing map file [ %s ]' % (map_name) | |
521 dst = open(map_name, 'w') | |
522 for m in self._ordered_markers: | |
523 chrom, snp, genpos, abspos, a1, a2 = self._markers[m] | |
524 dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos)) | |
525 dst.close() | |
526 | |
527 ped_name = self.path.replace('.bed', '.ped') | |
528 print 'Writing ped file [ %s ]' % (ped_name) | |
529 ped = open(ped_name, 'w') | |
530 firstyikes = False | |
531 for s, skey in enumerate(self._ordered_subjects): | |
532 idx = s*2 | |
533 (fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey] | |
534 ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe)) | |
535 genotypes_for_subject = self.getGenotypesForSubject(s) | |
536 for m, snp in enumerate(self._ordered_markers): | |
537 #a1, a2 = self.getGenotypeByIndices(s, m) | |
538 a1,a2 = genotypes_for_subject[m] | |
539 ped.write(' %s %s' % (a1, a2)) | |
540 ped.write('\n') | |
541 ped.close() | |
542 | |
543 def getGenotype(self, subject, marker): | |
544 """ Retrieve a genotype for a particular subject/marker pair | |
545 """ | |
546 m = self._ordered_markers.index(marker) | |
547 s = self._ordered_subjects.index(subject) | |
548 return self.getGenotypeByIndices(s, m) | |
549 | |
550 def getGenotypesForSubject(self, s, raw=False): | |
551 """ Returns list of genotypes for all m markers | |
552 for subject s. If raw==True, then an array | |
553 of raw integer gcodes is returned instead | |
554 """ | |
555 if self._quick: | |
556 nmarkers = len(self._markers) | |
557 raw_array = array('i', [0]*nmarkers) | |
558 seek_nibble = s % 4 | |
559 for m in xrange(nmarkers): | |
560 seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH | |
561 self._bedf.seek(seek_byte) | |
562 geno = struct.unpack('B', self._bedf.read(1))[0] | |
563 quartet = INT_TO_GCODE[geno] | |
564 gcode = quartet[seek_nibble] | |
565 raw_array[m] = gcode | |
566 else: | |
567 raw_array = array('i', [row[s] for row in self._genotypes]) | |
568 | |
569 if raw: | |
570 return raw_array | |
571 else: | |
572 result = [] | |
573 for m, gcode in enumerate(raw_array): | |
574 result.append(self._marker_allele_lookup[m][gcode]) | |
575 return result | |
576 | |
577 def getGenotypeByIndices(self, s, m): | |
578 """ | |
579 """ | |
580 if self._quick: | |
581 # Determine which byte we need to seek to, and | |
582 # which nibble within the byte we need | |
583 seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH | |
584 seek_nibble = s % 4 | |
585 self._bedf.seek(seek_byte) | |
586 geno = struct.unpack('B', self._bedf.read(1))[0] | |
587 quartet = INT_TO_GCODE[geno] | |
588 gcode = quartet[seek_nibble] | |
589 else: | |
590 # Otherwise, just grab the genotypes from the | |
591 # list of arrays | |
592 genos_for_marker = self._genotypes[m] | |
593 gcode = genos_for_marker[s] | |
594 | |
595 return self._marker_allele_lookup[m][gcode] | |
596 | |
597 def getGenotypesByIndices(self, s, mlist, format): | |
598 """ | |
599 """ | |
600 if self._quick: | |
601 raw_array = array('i', [0]*len(mlist)) | |
602 seek_nibble = s % 4 | |
603 for i,m in enumerate(mlist): | |
604 seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH | |
605 self._bedf.seek(seek_byte) | |
606 geno = struct.unpack('B', self._bedf.read(1))[0] | |
607 quartet = INT_TO_GCODE[geno] | |
608 gcode = quartet[seek_nibble] | |
609 raw_array[i] = gcode | |
610 mlist = set(mlist) | |
611 else: | |
612 mlist = set(mlist) | |
613 raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist]) | |
614 | |
615 if format == 'raw': | |
616 return raw_array | |
617 elif format == 'ref': | |
618 result = array('i', [0]*len(mlist)) | |
619 for m, gcode in enumerate(raw_array): | |
620 if gcode == HOM0: | |
621 nref = 3 | |
622 elif gcode == HET: | |
623 nref = 2 | |
624 elif gcode == HOM1: | |
625 nref = 1 | |
626 else: | |
627 nref = 0 | |
628 result[m] = nref | |
629 return result | |
630 else: | |
631 result = [] | |
632 for m, gcode in enumerate(raw_array): | |
633 result.append(self._marker_allele_lookup[m][gcode]) | |
634 return result | |
635 | |
636 def getSubject(self, s): | |
637 """ | |
638 """ | |
639 skey = self._ordered_subjects[s] | |
640 return self._subjects[skey] | |
641 | |
642 def autosomal_indices(self): | |
643 """ Return the indices of markers in this ped/map that are autosomal. | |
644 This is used by rgGRR so that it can select a random set of markers | |
645 from the autosomes (sex chroms screw up the plot) | |
646 """ | |
647 return self._autosomal_indices | |
648 | |
649 class Bed: | |
650 | |
651 def __init__(self, path): | |
652 self.path = path | |
653 self._genotypes = [] | |
654 self._fh = None | |
655 | |
656 def parse(self, subjects, markers, quick=False): | |
657 """ Parse the bed file, indicated either by the path parameter, | |
658 or as the self.path indicated in __init__. If quick is | |
659 True, then just parse the bim and fam, then genotypes will | |
660 be looked up dynamically by indices | |
661 """ | |
662 self._quick = quick | |
663 | |
664 ordered_markers = markers | |
665 ordered_subjects = subjects | |
666 nsubjects = len(ordered_subjects) | |
667 nmarkers = len(ordered_markers) | |
668 | |
669 bed = open(self.path, 'rb') | |
670 self._fh = bed | |
671 | |
672 byte1 = bed.read(1) | |
673 byte2 = bed.read(1) | |
674 byte3 = bed.read(1) | |
675 format_flag = struct.unpack('B', byte3)[0] | |
676 | |
677 h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]]) | |
678 h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]]) | |
679 h3 = tuple(INT_TO_GCODE[format_flag]) | |
680 | |
681 if h1 != MAGIC1 or h2 != MAGIC2: | |
682 raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2)) | |
683 if format_flag: | |
684 print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3) | |
685 else: | |
686 raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3) | |
687 | |
688 print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects) | |
689 | |
690 ### If quick mode was specified, we're done ... | |
691 self._quick = quick | |
692 if quick: | |
693 return | |
694 | |
695 ### ... Otherwise, parse genotypes into an array, and append that | |
696 ### array to self._genotypes | |
697 ngcodes = ceiling(nsubjects, 4) | |
698 bytes_per_marker = nbytes(nsubjects) | |
699 for m in xrange(nmarkers): | |
700 genotype_array = array('i', [-1]*(ngcodes)) | |
701 for byte in xrange(bytes_per_marker): | |
702 intval = struct.unpack('B', bed.read(1))[0] | |
703 idx = byte*4 | |
704 genotype_array[idx:idx+4] = INT_TO_GCODE[intval] | |
705 self._genotypes.append(genotype_array) | |
706 | |
707 class Bim: | |
708 def __init__(self, path): | |
709 """ | |
710 """ | |
711 self.path = path | |
712 self._markers = {} | |
713 self._ordered_markers = [] | |
714 self._marker_allele_lookup = {} | |
715 self._autosomal_indices = set() | |
716 | |
717 def parse(self): | |
718 """ | |
719 """ | |
720 print 'Reading map (extended format) from [ %s ]' % (self.path) | |
721 bim = open(self.path, 'r') | |
722 for m, line in enumerate(bim): | |
723 chrom, snp, gpos, apos, a1, a2 = line.strip().split() | |
724 self._markers[snp] = (chrom, snp, gpos, apos, a1, a2) | |
725 self._marker_allele_lookup[m] = { | |
726 HOM0: (a2, a2), | |
727 HOM1: (a1, a1), | |
728 HET : (a1, a2), | |
729 MISS: ('0','0'), | |
730 } | |
731 self._ordered_markers.append(snp) | |
732 if chrom in AUTOSOMES: | |
733 self._autosomal_indices.add(m) | |
734 bim.close() | |
735 print '%s markers to be included from [ %s ]' % (m+1, self.path) | |
736 | |
737 class Fam: | |
738 def __init__(self, path): | |
739 """ | |
740 """ | |
741 self.path = path | |
742 self._subjects = {} | |
743 self._ordered_subjects = [] | |
744 | |
745 def parse(self): | |
746 """ | |
747 """ | |
748 print 'Reading pedigree information from [ %s ]' % (self.path) | |
749 fam = open(self.path, 'r') | |
750 for s, line in enumerate(fam): | |
751 fid, iid, did, mid, sex, phe = line.strip().split() | |
752 sid = iid.split('.')[0] | |
753 d_sid = did.split('.')[0] | |
754 m_sid = mid.split('.')[0] | |
755 skey = (fid, iid) | |
756 self._ordered_subjects.append(skey) | |
757 self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) | |
758 fam.close() | |
759 print '%s individuals read from [ %s ]' % (s+1, self.path) | |
760 | |
761 ### Command-line functionality and testing | |
762 def test(arg): | |
763 ''' | |
764 ''' | |
765 | |
766 import time | |
767 | |
768 if arg == 'CAMP_AFFY.ped': | |
769 print 'Testing bed.parse(quick=True)' | |
770 s = time.time() | |
771 bed = Bed(arg.replace('.ped', '.bed')) | |
772 bed.parse(quick=True) | |
773 print bed.getGenotype(('400118', '10300283'), 'rs2000467') | |
774 print bed.getGenotype(('400118', '10101384'), 'rs2294019') | |
775 print bed.getGenotype(('400121', '10101149'), 'rs2294019') | |
776 print bed.getGenotype(('400123', '10200290'), 'rs2294019') | |
777 assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4') | |
778 e = time.time() | |
779 print 'e-s = %s\n' % (e-s) | |
780 | |
781 print 'Testing bed.parse' | |
782 s = time.time() | |
783 bed = BPed(arg) | |
784 bed.parse(quick=False) | |
785 e = time.time() | |
786 print 'e-s = %s\n' % (e-s) | |
787 | |
788 print 'Testing bed.writeped' | |
789 s = time.time() | |
790 outname = '%s_BEDTEST' % (arg) | |
791 bed.writeped(outname) | |
792 e = time.time() | |
793 print 'e-s = %s\n' % (e-s) | |
794 del(bed) | |
795 | |
796 print 'Testing ped.parse' | |
797 s = time.time() | |
798 ped = LPed(arg) | |
799 ped.parse() | |
800 e = time.time() | |
801 print 'e-s = %s\n' % (e-s) | |
802 | |
803 print 'Testing ped.writebed' | |
804 s = time.time() | |
805 outname = '%s_PEDTEST' % (arg) | |
806 ped.writebed(outname) | |
807 e = time.time() | |
808 print 'e-s = %s\n' % (e-s) | |
809 del(ped) | |
810 | |
811 def profile_bed(arg): | |
812 """ | |
813 """ | |
814 bed = BPed(arg) | |
815 bed.parse(quick=False) | |
816 outname = '%s_BEDPROFILE' % (arg) | |
817 bed.writeped(outname) | |
818 | |
819 def profile_ped(arg): | |
820 """ | |
821 """ | |
822 ped = LPed(arg) | |
823 ped.parse() | |
824 outname = '%s_PEDPROFILE' % (arg) | |
825 ped.writebed(outname) | |
826 | |
827 if __name__ == '__main__': | |
828 """ Run as a command-line, this script should get one or more arguments, | |
829 each one a ped file to be parsed with the PedParser (unit tests?) | |
830 """ | |
831 op = optparse.OptionParser() | |
832 op.add_option('--profile-bed', action='store_true', default=False) | |
833 op.add_option('--profile-ped', action='store_true', default=False) | |
834 opts, args = op.parse_args() | |
835 | |
836 if opts.profile_bed: | |
837 import profile | |
838 import pstats | |
839 profile.run('profile_bed(args[0])', 'fooprof') | |
840 p = pstats.Stats('fooprof') | |
841 p.sort_stats('cumulative').print_stats(10) | |
842 elif opts.profile_ped: | |
843 import profile | |
844 import pstats | |
845 profile.run('profile_ped(args[0])', 'fooprof') | |
846 p = pstats.Stats('fooprof') | |
847 p.sort_stats('cumulative').print_stats(10) | |
848 else: | |
849 for arg in args: | |
850 test(arg) | |
851 | |
852 ### Code used to generate the INT_TO_GCODE dictionary | |
853 #print '{\n ', | |
854 #for i in range(256): | |
855 # b = INT2BIN[i] | |
856 # ints = [] | |
857 # s = str(i).rjust(3) | |
858 # #print b | |
859 # for j in range(4): | |
860 # idx = j*2 | |
861 # #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2) | |
862 # ints.append(int(b[idx:idx+2], 2)) | |
863 # print '%s: array(\'i\', %s),' % (s,tuple(ints)), | |
864 # if i > 0 and (i+1) % 4 == 0: | |
865 # print '\n ', | |
866 #print '}' | |
867 | |
868 |