Mercurial > repos > xuebing > sharplabtool
diff tools/rgenetics/plinkbinJZ.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/rgenetics/plinkbinJZ.py Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,868 @@ +#!/usr/bin/env python2.4 +""" +""" + +import optparse,os,subprocess,gzip,struct,time,commands +from array import array + +#from AIMS import util +#from pga import util as pgautil + +__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $' + +VERBOSE = True + +MISSING_ALLELES = set(['N', '0', '.', '-','']) + +AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)]) + +MAGIC_BYTE1 = '00110110' +MAGIC_BYTE2 = '11011000' +FORMAT_SNP_MAJOR_BYTE = '10000000' +FORMAT_IND_MAJOR_BYTE = '00000000' +MAGIC1 = (0, 3, 1, 2) +MAGIC2 = (3, 1, 2, 0) +FORMAT_SNP_MAJOR = (2, 0, 0, 0) +FORMAT_IND_MAJOR = (0, 0, 0, 0) +HEADER_LENGTH = 3 + +HOM0 = 3 +HOM1 = 0 +MISS = 2 +HET = 1 +HOM0_GENO = (0, 0) +HOM1_GENO = (1, 1) +HET_GENO = (0, 1) +MISS_GENO = (-9, -9) + +GENO_TO_GCODE = { + HOM0_GENO: HOM0, + HET_GENO: HET, + HOM1_GENO: HOM1, + MISS_GENO: MISS, + } + +CHROM_REPLACE = { + 'X': '23', + 'Y': '24', + 'XY': '25', + 'MT': '26', + 'M': '26', +} + +MAP_LINE_EXCEPTION_TEXT = """ +One or more lines in the *.map file has only three fields. +The line was: + +%s + +If you are running rgGRR through EPMP, this is usually a +sign that you are using an old version of the map file. +You can correct the problem by re-running Subject QC. If +you have already tried this, please contact the developers, +or file a bug. +""" + +INT_TO_GCODE = { + 0: array('i', (0, 0, 0, 0)), 1: array('i', (2, 0, 0, 0)), 2: array('i', (1, 0, 0, 0)), 3: array('i', (3, 0, 0, 0)), + 4: array('i', (0, 2, 0, 0)), 5: array('i', (2, 2, 0, 0)), 6: array('i', (1, 2, 0, 0)), 7: array('i', (3, 2, 0, 0)), + 8: array('i', (0, 1, 0, 0)), 9: array('i', (2, 1, 0, 0)), 10: array('i', (1, 1, 0, 0)), 11: array('i', (3, 1, 0, 0)), + 12: array('i', (0, 3, 0, 0)), 13: array('i', (2, 3, 0, 0)), 14: array('i', (1, 3, 0, 0)), 15: array('i', (3, 3, 0, 0)), + 16: array('i', (0, 0, 2, 0)), 17: array('i', (2, 0, 2, 0)), 18: array('i', (1, 0, 2, 0)), 19: array('i', (3, 0, 2, 0)), + 20: array('i', (0, 2, 2, 0)), 21: array('i', (2, 2, 2, 0)), 22: array('i', (1, 2, 2, 0)), 23: array('i', (3, 2, 2, 0)), + 24: array('i', (0, 1, 2, 0)), 25: array('i', (2, 1, 2, 0)), 26: array('i', (1, 1, 2, 0)), 27: array('i', (3, 1, 2, 0)), + 28: array('i', (0, 3, 2, 0)), 29: array('i', (2, 3, 2, 0)), 30: array('i', (1, 3, 2, 0)), 31: array('i', (3, 3, 2, 0)), + 32: array('i', (0, 0, 1, 0)), 33: array('i', (2, 0, 1, 0)), 34: array('i', (1, 0, 1, 0)), 35: array('i', (3, 0, 1, 0)), + 36: array('i', (0, 2, 1, 0)), 37: array('i', (2, 2, 1, 0)), 38: array('i', (1, 2, 1, 0)), 39: array('i', (3, 2, 1, 0)), + 40: array('i', (0, 1, 1, 0)), 41: array('i', (2, 1, 1, 0)), 42: array('i', (1, 1, 1, 0)), 43: array('i', (3, 1, 1, 0)), + 44: array('i', (0, 3, 1, 0)), 45: array('i', (2, 3, 1, 0)), 46: array('i', (1, 3, 1, 0)), 47: array('i', (3, 3, 1, 0)), + 48: array('i', (0, 0, 3, 0)), 49: array('i', (2, 0, 3, 0)), 50: array('i', (1, 0, 3, 0)), 51: array('i', (3, 0, 3, 0)), + 52: array('i', (0, 2, 3, 0)), 53: array('i', (2, 2, 3, 0)), 54: array('i', (1, 2, 3, 0)), 55: array('i', (3, 2, 3, 0)), + 56: array('i', (0, 1, 3, 0)), 57: array('i', (2, 1, 3, 0)), 58: array('i', (1, 1, 3, 0)), 59: array('i', (3, 1, 3, 0)), + 60: array('i', (0, 3, 3, 0)), 61: array('i', (2, 3, 3, 0)), 62: array('i', (1, 3, 3, 0)), 63: array('i', (3, 3, 3, 0)), + 64: array('i', (0, 0, 0, 2)), 65: array('i', (2, 0, 0, 2)), 66: array('i', (1, 0, 0, 2)), 67: array('i', (3, 0, 0, 2)), + 68: array('i', (0, 2, 0, 2)), 69: array('i', (2, 2, 0, 2)), 70: array('i', (1, 2, 0, 2)), 71: array('i', (3, 2, 0, 2)), + 72: array('i', (0, 1, 0, 2)), 73: array('i', (2, 1, 0, 2)), 74: array('i', (1, 1, 0, 2)), 75: array('i', (3, 1, 0, 2)), + 76: array('i', (0, 3, 0, 2)), 77: array('i', (2, 3, 0, 2)), 78: array('i', (1, 3, 0, 2)), 79: array('i', (3, 3, 0, 2)), + 80: array('i', (0, 0, 2, 2)), 81: array('i', (2, 0, 2, 2)), 82: array('i', (1, 0, 2, 2)), 83: array('i', (3, 0, 2, 2)), + 84: array('i', (0, 2, 2, 2)), 85: array('i', (2, 2, 2, 2)), 86: array('i', (1, 2, 2, 2)), 87: array('i', (3, 2, 2, 2)), + 88: array('i', (0, 1, 2, 2)), 89: array('i', (2, 1, 2, 2)), 90: array('i', (1, 1, 2, 2)), 91: array('i', (3, 1, 2, 2)), + 92: array('i', (0, 3, 2, 2)), 93: array('i', (2, 3, 2, 2)), 94: array('i', (1, 3, 2, 2)), 95: array('i', (3, 3, 2, 2)), + 96: array('i', (0, 0, 1, 2)), 97: array('i', (2, 0, 1, 2)), 98: array('i', (1, 0, 1, 2)), 99: array('i', (3, 0, 1, 2)), + 100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)), + 104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)), + 108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)), + 112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', (3, 0, 3, 2)), + 116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)), + 120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)), + 124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)), + 128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)), + 132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)), + 136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)), + 140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)), + 144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)), + 148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)), + 152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)), + 156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)), + 160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)), + 164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)), + 168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)), + 172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)), + 176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)), + 180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)), + 184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)), + 188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)), + 192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)), + 196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)), + 200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)), + 204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)), + 208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)), + 212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)), + 216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)), + 220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)), + 224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)), + 228: array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)), + 232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)), + 236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)), + 240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)), + 244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)), + 248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)), + 252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)), + } + +GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()]) + +### Exceptions +class DuplicateMarkerInMapFile(Exception): pass +class MapLineTooShort(Exception): pass +class ThirdAllele(Exception): pass +class PedError(Exception): pass +class BadMagic(Exception): + """ Raised when one of the MAGIC bytes in a bed file does not match + """ + pass +class BedError(Exception): + """ Raised when parsing a bed file runs into problems + """ + pass +class UnknownGenocode(Exception): + """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?) + """ + pass +class UnknownGeno(Exception): pass + +### Utility functions + +def timenow(): + """return current time as a string + """ + return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) + +def ceiling(n, k): + ''' Return the least multiple of k which is greater than n + ''' + m = n % k + if m == 0: + return n + else: + return n + k - m + +def nbytes(n): + ''' Return the number of bytes required for n subjects + ''' + return 2*ceiling(n, 4)/8 + +### Primary module functionality +class LPed: + """ The uber-class for processing the Linkage-format *.ped/*.map files + """ + def __init__(self, base): + self.base = base + self._ped = Ped('%s.ped' % (self.base)) + self._map = Map('%s.map' % (self.base)) + + self._markers = {} + self._ordered_markers = [] + self._marker_allele_lookup = {} + self._autosomal_indices = set() + + self._subjects = {} + self._ordered_subjects = [] + + self._genotypes = [] + + def parse(self): + """ + """ + if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow()) + self._map.parse() + self._markers = self._map._markers + self._ordered_markers = self._map._ordered_markers + self._autosomal_indices = self._map._autosomal_indices + + self._ped.parse(self._ordered_markers) + self._subjects = self._ped._subjects + self._ordered_subjects = self._ped._ordered_subjects + self._genotypes = self._ped._genotypes + self._marker_allele_lookup = self._ped._marker_allele_lookup + + ### Adjust self._markers based on the allele information + ### we got from parsing the ped file + for m, name in enumerate(self._ordered_markers): + a1, a2 = self._marker_allele_lookup[m][HET] + self._markers[name][-2] = a1 + self._markers[name][-1] = a2 + if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow()) + + def getSubjectInfo(self, fid, oiid): + """ + """ + return self._subject_info[(fid, oiid)] + + def getSubjectInfoByLine(self, line): + """ + """ + return self._subject_info[self._ordered_subjects[line]] + + def getGenotypesByIndices(self, s, mlist, format): + """ needed for grr if lped - deprecated but.. + """ + mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ? + raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)]) + if format == 'raw': + return raw_array + elif format == 'ref': + result = array('i', [0]*len(mlist)) + for m, gcode in enumerate(raw_array): + if gcode == HOM0: + nref = 3 + elif gcode == HET: + nref = 2 + elif gcode == HOM1: + nref = 1 + else: + nref = 0 + result[m] = nref + return result + else: + result = [] + for m, gcode in enumerate(raw_array): + result.append(self._marker_allele_lookup[m][gcode]) + return result + + def writebed(self, base): + """ + """ + dst_name = '%s.fam' % (base) + print 'Writing pedigree information to [ %s ]' % (dst_name) + dst = open(dst_name, 'w') + for skey in self._ordered_subjects: + (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey] + dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe)) + dst.close() + + dst_name = '%s.bim' % (base) + print 'Writing map (extended format) information to [ %s ]' % (dst_name) + dst = open(dst_name, 'w') + for m, marker in enumerate(self._ordered_markers): + chrom, name, genpos, abspos, a1, a2 = self._markers[marker] + dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2)) + dst.close() + + bed_name = '%s.bed' % (base) + print 'Writing genotype bitfile to [ %s ]' % (bed_name) + print 'Using (default) SNP-major mode' + bed = open(bed_name, 'w') + + ### Write the 3 header bytes + bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2))) + bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2))) + bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2))) + + ### Calculate how many "pad bits" we should add after the last subject + nsubjects = len(self._ordered_subjects) + nmarkers = len(self._ordered_markers) + total_bytes = nbytes(nsubjects) + nbits = nsubjects * 2 + pad_nibbles = ((total_bytes * 8) - nbits)/2 + pad = array('i', [0]*pad_nibbles) + + ### And now write genotypes to the file + for m in xrange(nmarkers): + geno = self._genotypes[m] + geno.extend(pad) + bytes = len(geno)/4 + for b in range(bytes): + idx = b*4 + gcode = tuple(geno[idx:idx+4]) + try: + byte = struct.pack('B', GCODE_TO_INT[gcode]) + except KeyError: + print m, b, gcode + raise + bed.write(byte) + bed.close() + + def autosomal_indices(self): + """ Return the indices of markers in this ped/map that are autosomal. + This is used by rgGRR so that it can select a random set of markers + from the autosomes (sex chroms screw up the plot) + """ + return self._autosomal_indices + +class Ped: + def __init__(self, path): + self.path = path + self._subjects = {} + self._ordered_subjects = [] + self._genotypes = [] + self._marker_allele_lookup = {} + + def lineCount(self,infile): + """ count the number of lines in a file - efficiently using wget + """ + return int(commands.getoutput('wc -l %s' % (infile)).split()[0]) + + + def parse(self, markers): + """ Parse a given file -- this needs to be memory-efficient so that large + files can be parsed (~1 million markers on ~5000 subjects?). It + should also be fast, if possible. + """ + + ### Find out how many lines are in the file so we can ... + nsubjects = self.lineCount(self.path) + ### ... Pre-allocate the genotype arrays + nmarkers = len(markers) + _marker_alleles = [['0', '0'] for _ in xrange(nmarkers)] + self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)] + + if self.path.endswith('.gz'): + pfile = gzip.open(self.path, 'r') + else: + pfile = open(self.path, 'r') + + for s, line in enumerate(pfile): + line = line.strip() + if not line: + continue + + fid, iid, did, mid, sex, phe, genos = line.split(None, 6) + sid = iid.split('.')[0] + d_sid = did.split('.')[0] + m_sid = mid.split('.')[0] + + skey = (fid, iid) + self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) + self._ordered_subjects.append(skey) + + genotypes = genos.split() + + for m, marker in enumerate(markers): + idx = m*2 + a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m + s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m + + ### FIXME: I think this can still be faster, and simpler to read + # Two pieces of logic intertwined here: first, we need to code + # this genotype as HOM0, HOM1, HET or MISS. Second, we need to + # keep an ongoing record of the genotypes seen for this marker + if a1 == a2: + if a1 in MISSING_ALLELES: + geno = MISS_GENO + else: + if s1 == '0': + seen[0] = a1 + elif s1 == a1 or s2 == a2: + pass + elif s2 == '0': + seen[1] = a1 + else: + raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen))) + + if a1 == seen[0]: + geno = HOM0_GENO + elif a1 == seen[1]: + geno = HOM1_GENO + else: + raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen))) + elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES: + geno = MISS_GENO + else: + geno = HET_GENO + if s1 == '0': + seen[0] = a1 + seen[1] = a2 + elif s2 == '0': + if s1 == a1: + seen[1] = a2 + elif s1 == a2: + seen[1] = a1 + else: + raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen))) + else: + if sorted(seen) != sorted((a1, a2)): + raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen))) + + gcode = GENO_TO_GCODE.get(geno, None) + if gcode is None: + raise UnknownGeno(str(geno)) + self._genotypes[m][s] = gcode + + # Build the _marker_allele_lookup table + for m, alleles in enumerate(_marker_alleles): + if len(alleles) == 2: + a1, a2 = alleles + elif len(alleles) == 1: + a1 = alleles[0] + a2 = '0' + else: + print 'All alleles blank for %s: %s' % (m, str(alleles)) + raise + + self._marker_allele_lookup[m] = { + HOM0: (a2, a2), + HOM1: (a1, a1), + HET : (a1, a2), + MISS: ('0','0'), + } + + if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects), nsubjects, self.path) + +class Map: + def __init__(self, path=None): + self.path = path + self._markers = {} + self._ordered_markers = [] + self._autosomal_indices = set() + + def __len__(self): + return len(self._markers) + + def parse(self): + """ Parse a Linkage-format map file + """ + if self.path.endswith('.gz'): + fh = gzip.open(self.path, 'r') + else: + fh = open(self.path, 'r') + + for i, line in enumerate(fh): + line = line.strip() + if not line: + continue + + fields = line.split() + if len(fields) < 4: + raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line), len(fields))) + else: + chrom, name, genpos, abspos = fields + if name in self._markers: + raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path)) + abspos = int(abspos) + if abspos < 0: + continue + if chrom in AUTOSOMES: + self._autosomal_indices.add(i) + chrom = CHROM_REPLACE.get(chrom, chrom) + self._markers[name] = [chrom, name, genpos, abspos, None, None] + self._ordered_markers.append(name) + fh.close() + if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers), i, self.path) + +class BPed: + """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam + """ + def __init__(self, base): + self.base = base + self._bed = Bed('%s.bed' % (self.base)) + self._bim = Bim('%s.bim' % (self.base)) + self._fam = Fam('%s.fam' % (self.base)) + + self._markers = {} + self._ordered_markers = [] + self._marker_allele_lookup = {} + self._autosomal_indices = set() + + self._subjects = {} + self._ordered_subjects = [] + + self._genotypes = [] + + def parse(self, quick=False): + """ + """ + self._quick = quick + + self._bim.parse() + self._markers = self._bim._markers + self._ordered_markers = self._bim._ordered_markers + self._marker_allele_lookup = self._bim._marker_allele_lookup + self._autosomal_indices = self._bim._autosomal_indices + + self._fam.parse() + self._subjects = self._fam._subjects + self._ordered_subjects = self._fam._ordered_subjects + + self._bed.parse(self._ordered_subjects, self._ordered_markers, quick=quick) + self._bedf = self._bed._fh + self._genotypes = self._bed._genotypes + self.nsubjects = len(self._ordered_subjects) + self.nmarkers = len(self._ordered_markers) + self._bytes_per_marker = nbytes(self.nsubjects) + + def writeped(self, path=None): + """ + """ + path = self.path = path or self.path + + map_name = self.path.replace('.bed', '.map') + print 'Writing map file [ %s ]' % (map_name) + dst = open(map_name, 'w') + for m in self._ordered_markers: + chrom, snp, genpos, abspos, a1, a2 = self._markers[m] + dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos)) + dst.close() + + ped_name = self.path.replace('.bed', '.ped') + print 'Writing ped file [ %s ]' % (ped_name) + ped = open(ped_name, 'w') + firstyikes = False + for s, skey in enumerate(self._ordered_subjects): + idx = s*2 + (fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey] + ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe)) + genotypes_for_subject = self.getGenotypesForSubject(s) + for m, snp in enumerate(self._ordered_markers): + #a1, a2 = self.getGenotypeByIndices(s, m) + a1,a2 = genotypes_for_subject[m] + ped.write(' %s %s' % (a1, a2)) + ped.write('\n') + ped.close() + + def getGenotype(self, subject, marker): + """ Retrieve a genotype for a particular subject/marker pair + """ + m = self._ordered_markers.index(marker) + s = self._ordered_subjects.index(subject) + return self.getGenotypeByIndices(s, m) + + def getGenotypesForSubject(self, s, raw=False): + """ Returns list of genotypes for all m markers + for subject s. If raw==True, then an array + of raw integer gcodes is returned instead + """ + if self._quick: + nmarkers = len(self._markers) + raw_array = array('i', [0]*nmarkers) + seek_nibble = s % 4 + for m in xrange(nmarkers): + seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH + self._bedf.seek(seek_byte) + geno = struct.unpack('B', self._bedf.read(1))[0] + quartet = INT_TO_GCODE[geno] + gcode = quartet[seek_nibble] + raw_array[m] = gcode + else: + raw_array = array('i', [row[s] for row in self._genotypes]) + + if raw: + return raw_array + else: + result = [] + for m, gcode in enumerate(raw_array): + result.append(self._marker_allele_lookup[m][gcode]) + return result + + def getGenotypeByIndices(self, s, m): + """ + """ + if self._quick: + # Determine which byte we need to seek to, and + # which nibble within the byte we need + seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH + seek_nibble = s % 4 + self._bedf.seek(seek_byte) + geno = struct.unpack('B', self._bedf.read(1))[0] + quartet = INT_TO_GCODE[geno] + gcode = quartet[seek_nibble] + else: + # Otherwise, just grab the genotypes from the + # list of arrays + genos_for_marker = self._genotypes[m] + gcode = genos_for_marker[s] + + return self._marker_allele_lookup[m][gcode] + + def getGenotypesByIndices(self, s, mlist, format): + """ + """ + if self._quick: + raw_array = array('i', [0]*len(mlist)) + seek_nibble = s % 4 + for i,m in enumerate(mlist): + seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH + self._bedf.seek(seek_byte) + geno = struct.unpack('B', self._bedf.read(1))[0] + quartet = INT_TO_GCODE[geno] + gcode = quartet[seek_nibble] + raw_array[i] = gcode + mlist = set(mlist) + else: + mlist = set(mlist) + raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist]) + + if format == 'raw': + return raw_array + elif format == 'ref': + result = array('i', [0]*len(mlist)) + for m, gcode in enumerate(raw_array): + if gcode == HOM0: + nref = 3 + elif gcode == HET: + nref = 2 + elif gcode == HOM1: + nref = 1 + else: + nref = 0 + result[m] = nref + return result + else: + result = [] + for m, gcode in enumerate(raw_array): + result.append(self._marker_allele_lookup[m][gcode]) + return result + + def getSubject(self, s): + """ + """ + skey = self._ordered_subjects[s] + return self._subjects[skey] + + def autosomal_indices(self): + """ Return the indices of markers in this ped/map that are autosomal. + This is used by rgGRR so that it can select a random set of markers + from the autosomes (sex chroms screw up the plot) + """ + return self._autosomal_indices + +class Bed: + + def __init__(self, path): + self.path = path + self._genotypes = [] + self._fh = None + + def parse(self, subjects, markers, quick=False): + """ Parse the bed file, indicated either by the path parameter, + or as the self.path indicated in __init__. If quick is + True, then just parse the bim and fam, then genotypes will + be looked up dynamically by indices + """ + self._quick = quick + + ordered_markers = markers + ordered_subjects = subjects + nsubjects = len(ordered_subjects) + nmarkers = len(ordered_markers) + + bed = open(self.path, 'rb') + self._fh = bed + + byte1 = bed.read(1) + byte2 = bed.read(1) + byte3 = bed.read(1) + format_flag = struct.unpack('B', byte3)[0] + + h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]]) + h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]]) + h3 = tuple(INT_TO_GCODE[format_flag]) + + if h1 != MAGIC1 or h2 != MAGIC2: + raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2)) + if format_flag: + print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3) + else: + raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3) + + print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects) + + ### If quick mode was specified, we're done ... + self._quick = quick + if quick: + return + + ### ... Otherwise, parse genotypes into an array, and append that + ### array to self._genotypes + ngcodes = ceiling(nsubjects, 4) + bytes_per_marker = nbytes(nsubjects) + for m in xrange(nmarkers): + genotype_array = array('i', [-1]*(ngcodes)) + for byte in xrange(bytes_per_marker): + intval = struct.unpack('B', bed.read(1))[0] + idx = byte*4 + genotype_array[idx:idx+4] = INT_TO_GCODE[intval] + self._genotypes.append(genotype_array) + +class Bim: + def __init__(self, path): + """ + """ + self.path = path + self._markers = {} + self._ordered_markers = [] + self._marker_allele_lookup = {} + self._autosomal_indices = set() + + def parse(self): + """ + """ + print 'Reading map (extended format) from [ %s ]' % (self.path) + bim = open(self.path, 'r') + for m, line in enumerate(bim): + chrom, snp, gpos, apos, a1, a2 = line.strip().split() + self._markers[snp] = (chrom, snp, gpos, apos, a1, a2) + self._marker_allele_lookup[m] = { + HOM0: (a2, a2), + HOM1: (a1, a1), + HET : (a1, a2), + MISS: ('0','0'), + } + self._ordered_markers.append(snp) + if chrom in AUTOSOMES: + self._autosomal_indices.add(m) + bim.close() + print '%s markers to be included from [ %s ]' % (m+1, self.path) + +class Fam: + def __init__(self, path): + """ + """ + self.path = path + self._subjects = {} + self._ordered_subjects = [] + + def parse(self): + """ + """ + print 'Reading pedigree information from [ %s ]' % (self.path) + fam = open(self.path, 'r') + for s, line in enumerate(fam): + fid, iid, did, mid, sex, phe = line.strip().split() + sid = iid.split('.')[0] + d_sid = did.split('.')[0] + m_sid = mid.split('.')[0] + skey = (fid, iid) + self._ordered_subjects.append(skey) + self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) + fam.close() + print '%s individuals read from [ %s ]' % (s+1, self.path) + +### Command-line functionality and testing +def test(arg): + ''' + ''' + + import time + + if arg == 'CAMP_AFFY.ped': + print 'Testing bed.parse(quick=True)' + s = time.time() + bed = Bed(arg.replace('.ped', '.bed')) + bed.parse(quick=True) + print bed.getGenotype(('400118', '10300283'), 'rs2000467') + print bed.getGenotype(('400118', '10101384'), 'rs2294019') + print bed.getGenotype(('400121', '10101149'), 'rs2294019') + print bed.getGenotype(('400123', '10200290'), 'rs2294019') + assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4') + e = time.time() + print 'e-s = %s\n' % (e-s) + + print 'Testing bed.parse' + s = time.time() + bed = BPed(arg) + bed.parse(quick=False) + e = time.time() + print 'e-s = %s\n' % (e-s) + + print 'Testing bed.writeped' + s = time.time() + outname = '%s_BEDTEST' % (arg) + bed.writeped(outname) + e = time.time() + print 'e-s = %s\n' % (e-s) + del(bed) + + print 'Testing ped.parse' + s = time.time() + ped = LPed(arg) + ped.parse() + e = time.time() + print 'e-s = %s\n' % (e-s) + + print 'Testing ped.writebed' + s = time.time() + outname = '%s_PEDTEST' % (arg) + ped.writebed(outname) + e = time.time() + print 'e-s = %s\n' % (e-s) + del(ped) + +def profile_bed(arg): + """ + """ + bed = BPed(arg) + bed.parse(quick=False) + outname = '%s_BEDPROFILE' % (arg) + bed.writeped(outname) + +def profile_ped(arg): + """ + """ + ped = LPed(arg) + ped.parse() + outname = '%s_PEDPROFILE' % (arg) + ped.writebed(outname) + +if __name__ == '__main__': + """ Run as a command-line, this script should get one or more arguments, + each one a ped file to be parsed with the PedParser (unit tests?) + """ + op = optparse.OptionParser() + op.add_option('--profile-bed', action='store_true', default=False) + op.add_option('--profile-ped', action='store_true', default=False) + opts, args = op.parse_args() + + if opts.profile_bed: + import profile + import pstats + profile.run('profile_bed(args[0])', 'fooprof') + p = pstats.Stats('fooprof') + p.sort_stats('cumulative').print_stats(10) + elif opts.profile_ped: + import profile + import pstats + profile.run('profile_ped(args[0])', 'fooprof') + p = pstats.Stats('fooprof') + p.sort_stats('cumulative').print_stats(10) + else: + for arg in args: + test(arg) + + ### Code used to generate the INT_TO_GCODE dictionary + #print '{\n ', + #for i in range(256): + # b = INT2BIN[i] + # ints = [] + # s = str(i).rjust(3) + # #print b + # for j in range(4): + # idx = j*2 + # #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2) + # ints.append(int(b[idx:idx+2], 2)) + # print '%s: array(\'i\', %s),' % (s,tuple(ints)), + # if i > 0 and (i+1) % 4 == 0: + # print '\n ', + #print '}' + +