Mercurial > repos > xuebing > sharplabtool

#!/usr/bin/env python2.4
"""
"""

import optparse,os,subprocess,gzip,struct,time,commands
from array import array

#from AIMS import util
#from pga import util as pgautil

__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $'

VERBOSE = True

MISSING_ALLELES = set(['N', '0', '.', '-',''])

AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)])

MAGIC_BYTE1 = '00110110'
MAGIC_BYTE2 = '11011000'
FORMAT_SNP_MAJOR_BYTE = '10000000'
FORMAT_IND_MAJOR_BYTE = '00000000'
MAGIC1 = (0, 3, 1, 2)
MAGIC2 = (3, 1, 2, 0)
FORMAT_SNP_MAJOR = (2, 0, 0, 0)
FORMAT_IND_MAJOR = (0, 0, 0, 0)
HEADER_LENGTH = 3

HOM0 = 3
HOM1 = 0
MISS = 2
HET  = 1
HOM0_GENO = (0, 0)
HOM1_GENO = (1, 1)
HET_GENO = (0, 1)
MISS_GENO = (-9, -9)

GENO_TO_GCODE = {
    HOM0_GENO: HOM0,
    HET_GENO: HET,
    HOM1_GENO: HOM1,
    MISS_GENO: MISS,
    }

CHROM_REPLACE = {
    'X': '23',
    'Y': '24',
    'XY': '25',
    'MT': '26',
    'M': '26',
}

MAP_LINE_EXCEPTION_TEXT = """
One or more lines in the *.map file has only three fields.
The line was:

%s

If you are running rgGRR through EPMP, this is usually a
sign that you are using an old version of the map file.
You can correct the problem by re-running Subject QC.  If
you have already tried this, please contact the developers,
or file a bug.
"""

INT_TO_GCODE = {
     0: array('i', (0, 0, 0, 0)),   1: array('i', (2, 0, 0, 0)),   2: array('i', (1, 0, 0, 0)),   3: array('i', (3, 0, 0, 0)),
     4: array('i', (0, 2, 0, 0)),   5: array('i', (2, 2, 0, 0)),   6: array('i', (1, 2, 0, 0)),   7: array('i', (3, 2, 0, 0)),
     8: array('i', (0, 1, 0, 0)),   9: array('i', (2, 1, 0, 0)),  10: array('i', (1, 1, 0, 0)),  11: array('i', (3, 1, 0, 0)),
    12: array('i', (0, 3, 0, 0)),  13: array('i', (2, 3, 0, 0)),  14: array('i', (1, 3, 0, 0)),  15: array('i', (3, 3, 0, 0)),
    16: array('i', (0, 0, 2, 0)),  17: array('i', (2, 0, 2, 0)),  18: array('i', (1, 0, 2, 0)),  19: array('i', (3, 0, 2, 0)),
    20: array('i', (0, 2, 2, 0)),  21: array('i', (2, 2, 2, 0)),  22: array('i', (1, 2, 2, 0)),  23: array('i', (3, 2, 2, 0)),
    24: array('i', (0, 1, 2, 0)),  25: array('i', (2, 1, 2, 0)),  26: array('i', (1, 1, 2, 0)),  27: array('i', (3, 1, 2, 0)),
    28: array('i', (0, 3, 2, 0)),  29: array('i', (2, 3, 2, 0)),  30: array('i', (1, 3, 2, 0)),  31: array('i', (3, 3, 2, 0)),
    32: array('i', (0, 0, 1, 0)),  33: array('i', (2, 0, 1, 0)),  34: array('i', (1, 0, 1, 0)),  35: array('i', (3, 0, 1, 0)),
    36: array('i', (0, 2, 1, 0)),  37: array('i', (2, 2, 1, 0)),  38: array('i', (1, 2, 1, 0)),  39: array('i', (3, 2, 1, 0)),
    40: array('i', (0, 1, 1, 0)),  41: array('i', (2, 1, 1, 0)),  42: array('i', (1, 1, 1, 0)),  43: array('i', (3, 1, 1, 0)),
    44: array('i', (0, 3, 1, 0)),  45: array('i', (2, 3, 1, 0)),  46: array('i', (1, 3, 1, 0)),  47: array('i', (3, 3, 1, 0)),
    48: array('i', (0, 0, 3, 0)),  49: array('i', (2, 0, 3, 0)),  50: array('i', (1, 0, 3, 0)),  51: array('i', (3, 0, 3, 0)),
    52: array('i', (0, 2, 3, 0)),  53: array('i', (2, 2, 3, 0)),  54: array('i', (1, 2, 3, 0)),  55: array('i', (3, 2, 3, 0)),
    56: array('i', (0, 1, 3, 0)),  57: array('i', (2, 1, 3, 0)),  58: array('i', (1, 1, 3, 0)),  59: array('i', (3, 1, 3, 0)),
    60: array('i', (0, 3, 3, 0)),  61: array('i', (2, 3, 3, 0)),  62: array('i', (1, 3, 3, 0)),  63: array('i', (3, 3, 3, 0)),
    64: array('i', (0, 0, 0, 2)),  65: array('i', (2, 0, 0, 2)),  66: array('i', (1, 0, 0, 2)),  67: array('i', (3, 0, 0, 2)),
    68: array('i', (0, 2, 0, 2)),  69: array('i', (2, 2, 0, 2)),  70: array('i', (1, 2, 0, 2)),  71: array('i', (3, 2, 0, 2)),
    72: array('i', (0, 1, 0, 2)),  73: array('i', (2, 1, 0, 2)),  74: array('i', (1, 1, 0, 2)),  75: array('i', (3, 1, 0, 2)),
    76: array('i', (0, 3, 0, 2)),  77: array('i', (2, 3, 0, 2)),  78: array('i', (1, 3, 0, 2)),  79: array('i', (3, 3, 0, 2)),
    80: array('i', (0, 0, 2, 2)),  81: array('i', (2, 0, 2, 2)),  82: array('i', (1, 0, 2, 2)),  83: array('i', (3, 0, 2, 2)),
    84: array('i', (0, 2, 2, 2)),  85: array('i', (2, 2, 2, 2)),  86: array('i', (1, 2, 2, 2)),  87: array('i', (3, 2, 2, 2)),
    88: array('i', (0, 1, 2, 2)),  89: array('i', (2, 1, 2, 2)),  90: array('i', (1, 1, 2, 2)),  91: array('i', (3, 1, 2, 2)),
    92: array('i', (0, 3, 2, 2)),  93: array('i', (2, 3, 2, 2)),  94: array('i', (1, 3, 2, 2)),  95: array('i', (3, 3, 2, 2)),
    96: array('i', (0, 0, 1, 2)),  97: array('i', (2, 0, 1, 2)),  98: array('i', (1, 0, 1, 2)),  99: array('i', (3, 0, 1, 2)),
   100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)),
   104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)),
   108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)),
   112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', (3, 0, 3, 2)),
   116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)),
   120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)),
   124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)),
   128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)),
   132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)),
   136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)),
   140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)),
   144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)),
   148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)),
   152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)),
   156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)),
   160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)),
   164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)),
   168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)),
   172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)),
   176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)),
   180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)),
   184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)),
   188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)),
   192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)),
   196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)),
   200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)),
   204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)),
   208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)),
   212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)),
   216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)),
   220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)),
   224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)),
   228: array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)),
   232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)),
   236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)),
   240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)),
   244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)),
   248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)),
   252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)),
   }

GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()])

### Exceptions
class DuplicateMarkerInMapFile(Exception): pass
class MapLineTooShort(Exception): pass
class ThirdAllele(Exception): pass
class PedError(Exception): pass
class BadMagic(Exception):
    """ Raised when one of the MAGIC bytes in a bed file does not match
    """
    pass
class BedError(Exception):
    """ Raised when parsing a bed file runs into problems
    """
    pass
class UnknownGenocode(Exception):
    """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?)
    """
    pass
class UnknownGeno(Exception): pass

### Utility functions

def timenow():
    """return current time as a string
    """
    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))

def ceiling(n, k):
    ''' Return the least multiple of k which is greater than n
    '''
    m = n % k
    if m == 0:
        return n
    else:
        return n + k - m

def nbytes(n):
    ''' Return the number of bytes required for n subjects
    '''
    return 2*ceiling(n, 4)/8

### Primary module functionality
class LPed:
    """ The uber-class for processing the Linkage-format *.ped/*.map files
    """
    def __init__(self,  base):
        self.base = base
        self._ped = Ped('%s.ped' % (self.base))
        self._map = Map('%s.map' % (self.base))

        self._markers = {}
        self._ordered_markers = []
        self._marker_allele_lookup = {}
        self._autosomal_indices = set()

        self._subjects = {}
        self._ordered_subjects = []

        self._genotypes = []

    def parse(self):
        """
        """
        if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow())
        self._map.parse()
        self._markers = self._map._markers
        self._ordered_markers = self._map._ordered_markers
        self._autosomal_indices = self._map._autosomal_indices

        self._ped.parse(self._ordered_markers)
        self._subjects = self._ped._subjects
        self._ordered_subjects = self._ped._ordered_subjects
        self._genotypes = self._ped._genotypes
        self._marker_allele_lookup = self._ped._marker_allele_lookup

        ### Adjust self._markers based on the allele information
        ### we got from parsing the ped file
        for m,  name in enumerate(self._ordered_markers):
            a1,  a2 = self._marker_allele_lookup[m][HET]
            self._markers[name][-2] = a1
            self._markers[name][-1] = a2
        if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow())

    def getSubjectInfo(self, fid, oiid):
        """
        """
        return self._subject_info[(fid, oiid)]

    def getSubjectInfoByLine(self, line):
        """
        """
        return self._subject_info[self._ordered_subjects[line]]

    def getGenotypesByIndices(self, s, mlist, format):
        """ needed for grr if lped - deprecated but..
        """
        mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ?
        raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)])
        if format == 'raw':
            return raw_array
        elif format == 'ref':
            result = array('i', [0]*len(mlist))
            for m, gcode in enumerate(raw_array):
                if gcode == HOM0:
                    nref = 3
                elif gcode == HET:
                    nref = 2
                elif gcode == HOM1:
                    nref = 1
                else:
                    nref = 0
                result[m] = nref
            return result
        else:
            result = []
            for m, gcode in enumerate(raw_array):
                result.append(self._marker_allele_lookup[m][gcode])
            return result

    def writebed(self, base):
        """
        """
        dst_name = '%s.fam' % (base)
        print 'Writing pedigree information to [ %s ]' % (dst_name)
        dst = open(dst_name, 'w')
        for skey in self._ordered_subjects:
            (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey]
            dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe))
        dst.close()

        dst_name = '%s.bim' % (base)
        print 'Writing map (extended format) information to [ %s ]' % (dst_name)
        dst = open(dst_name, 'w')
        for m, marker in enumerate(self._ordered_markers):
            chrom, name, genpos, abspos,  a1,  a2 = self._markers[marker]
            dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2))
        dst.close()

        bed_name = '%s.bed' % (base)
        print 'Writing genotype bitfile to [ %s ]' % (bed_name)
        print 'Using (default) SNP-major mode'
        bed = open(bed_name, 'w')

        ### Write the 3 header bytes
        bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2)))
        bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2)))
        bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2)))

        ### Calculate how many "pad bits" we should add after the last subject
        nsubjects = len(self._ordered_subjects)
        nmarkers = len(self._ordered_markers)
        total_bytes = nbytes(nsubjects)
        nbits = nsubjects  * 2
        pad_nibbles = ((total_bytes * 8) - nbits)/2
        pad = array('i', [0]*pad_nibbles)

        ### And now write genotypes to the file
        for m in xrange(nmarkers):
            geno = self._genotypes[m]
            geno.extend(pad)
            bytes = len(geno)/4
            for b in range(bytes):
                idx = b*4
                gcode = tuple(geno[idx:idx+4])
                try:
                    byte = struct.pack('B', GCODE_TO_INT[gcode])
                except KeyError:
                    print m, b, gcode
                    raise
                bed.write(byte)
        bed.close()

    def autosomal_indices(self):
        """ Return the indices of markers in this ped/map that are autosomal.
            This is used by rgGRR so that it can select a random set of markers
            from the autosomes (sex chroms screw up the plot)
        """
        return self._autosomal_indices

class Ped:
    def __init__(self, path):
        self.path = path
        self._subjects = {}
        self._ordered_subjects = []
        self._genotypes = []
        self._marker_allele_lookup = {}

    def lineCount(self,infile):
        """ count the number of lines in a file - efficiently using wget
        """
        return int(commands.getoutput('wc -l %s' % (infile)).split()[0])


    def parse(self,  markers):
        """ Parse a given file -- this needs to be memory-efficient so that large
            files can be parsed (~1 million markers on ~5000 subjects?).  It
            should also be fast, if possible.
        """

        ### Find out how many lines are in the file so we can ...
        nsubjects = self.lineCount(self.path)
        ### ... Pre-allocate the genotype arrays
        nmarkers = len(markers)
        _marker_alleles = [['0', '0'] for _ in xrange(nmarkers)]
        self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)]

        if self.path.endswith('.gz'):
            pfile = gzip.open(self.path, 'r')
        else:
            pfile = open(self.path, 'r')

        for s, line in enumerate(pfile):
            line = line.strip()
            if not line:
                continue

            fid, iid, did, mid, sex, phe, genos = line.split(None, 6)
            sid = iid.split('.')[0]
            d_sid = did.split('.')[0]
            m_sid = mid.split('.')[0]

            skey = (fid, iid)
            self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
            self._ordered_subjects.append(skey)

            genotypes = genos.split()

            for m, marker in enumerate(markers):
                idx = m*2
                a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m
                s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m

                ### FIXME: I think this can still be faster, and simpler to read
                # Two pieces of logic intertwined here:  first, we need to code
                # this genotype as HOM0, HOM1, HET or MISS.  Second, we need to
                # keep an ongoing record of the genotypes seen for this marker
                if a1 == a2:
                    if a1 in MISSING_ALLELES:
                        geno = MISS_GENO
                    else:
                        if s1 == '0':
                            seen[0] = a1
                        elif s1 == a1 or s2 == a2:
                            pass
                        elif s2 == '0':
                            seen[1] = a1
                        else:
                            raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen)))

                        if a1 == seen[0]:
                            geno = HOM0_GENO
                        elif a1 == seen[1]:
                            geno = HOM1_GENO
                        else:
                            raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen)))
                elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES:
                    geno = MISS_GENO
                else:
                    geno = HET_GENO
                    if s1 == '0':
                        seen[0] = a1
                        seen[1] = a2
                    elif s2 == '0':
                        if s1 == a1:
                            seen[1] = a2
                        elif s1 == a2:
                            seen[1] = a1
                        else:
                            raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
                    else:
                        if sorted(seen) != sorted((a1, a2)):
                            raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))

                gcode = GENO_TO_GCODE.get(geno, None)
                if gcode is None:
                    raise UnknownGeno(str(geno))
                self._genotypes[m][s] = gcode

        # Build the _marker_allele_lookup table
        for m,  alleles in enumerate(_marker_alleles):
            if len(alleles) == 2:
                a1,  a2 = alleles
            elif len(alleles) == 1:
                a1 = alleles[0]
                a2 = '0'
            else:
                print 'All alleles blank for %s: %s' % (m,  str(alleles))
                raise

            self._marker_allele_lookup[m] = {
                HOM0: (a2, a2),
                HOM1: (a1, a1),
                HET : (a1, a2),
                MISS: ('0','0'),
                }

        if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects),  nsubjects,  self.path)

class Map:
    def __init__(self, path=None):
        self.path = path
        self._markers = {}
        self._ordered_markers = []
        self._autosomal_indices = set()

    def __len__(self):
        return len(self._markers)

    def parse(self):
        """ Parse a Linkage-format map file
        """
        if self.path.endswith('.gz'):
            fh = gzip.open(self.path, 'r')
        else:
            fh = open(self.path, 'r')

        for i, line in enumerate(fh):
            line = line.strip()
            if not line:
                continue

            fields = line.split()
            if len(fields) < 4:
                raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line),  len(fields)))
            else:
                chrom, name, genpos, abspos = fields
            if name in self._markers:
                raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path))
            abspos = int(abspos)
            if abspos < 0:
                continue
            if chrom in AUTOSOMES:
                self._autosomal_indices.add(i)
            chrom = CHROM_REPLACE.get(chrom, chrom)
            self._markers[name] = [chrom, name, genpos, abspos,  None,  None]
            self._ordered_markers.append(name)
        fh.close()
        if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers),  i,  self.path)

class BPed:
    """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam
    """
    def __init__(self,  base):
        self.base = base
        self._bed = Bed('%s.bed' % (self.base))
        self._bim = Bim('%s.bim' % (self.base))
        self._fam = Fam('%s.fam' % (self.base))

        self._markers = {}
        self._ordered_markers = []
        self._marker_allele_lookup = {}
        self._autosomal_indices = set()

        self._subjects = {}
        self._ordered_subjects = []

        self._genotypes = []

    def parse(self,  quick=False):
        """
        """
        self._quick = quick

        self._bim.parse()
        self._markers = self._bim._markers
        self._ordered_markers = self._bim._ordered_markers
        self._marker_allele_lookup = self._bim._marker_allele_lookup
        self._autosomal_indices = self._bim._autosomal_indices

        self._fam.parse()
        self._subjects = self._fam._subjects
        self._ordered_subjects = self._fam._ordered_subjects

        self._bed.parse(self._ordered_subjects,  self._ordered_markers,  quick=quick)
        self._bedf = self._bed._fh
        self._genotypes = self._bed._genotypes
        self.nsubjects = len(self._ordered_subjects)
        self.nmarkers = len(self._ordered_markers)
        self._bytes_per_marker = nbytes(self.nsubjects)

    def writeped(self, path=None):
        """
        """
        path = self.path = path or self.path

        map_name = self.path.replace('.bed', '.map')
        print 'Writing map file [ %s ]' % (map_name)
        dst = open(map_name, 'w')
        for m in self._ordered_markers:
            chrom, snp, genpos, abspos, a1, a2 = self._markers[m]
            dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos))
        dst.close()

        ped_name = self.path.replace('.bed', '.ped')
        print 'Writing ped file [ %s ]' % (ped_name)
        ped = open(ped_name, 'w')
        firstyikes = False
        for s, skey in enumerate(self._ordered_subjects):
            idx = s*2
            (fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey]
            ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe))
            genotypes_for_subject = self.getGenotypesForSubject(s)
            for m, snp in enumerate(self._ordered_markers):
                #a1, a2 = self.getGenotypeByIndices(s, m)
                a1,a2 = genotypes_for_subject[m]
                ped.write(' %s %s' % (a1, a2))
            ped.write('\n')
        ped.close()

    def getGenotype(self, subject, marker):
        """ Retrieve a genotype for a particular subject/marker pair
        """
        m = self._ordered_markers.index(marker)
        s = self._ordered_subjects.index(subject)
        return self.getGenotypeByIndices(s, m)

    def getGenotypesForSubject(self, s, raw=False):
        """ Returns list of genotypes for all m markers
            for subject s.  If raw==True, then an array
            of raw integer gcodes is returned instead
        """
        if self._quick:
            nmarkers = len(self._markers)
            raw_array = array('i', [0]*nmarkers)
            seek_nibble = s % 4
            for m in xrange(nmarkers):
                seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
                self._bedf.seek(seek_byte)
                geno = struct.unpack('B', self._bedf.read(1))[0]
                quartet = INT_TO_GCODE[geno]
                gcode = quartet[seek_nibble]
                raw_array[m] = gcode
        else:
            raw_array = array('i', [row[s] for row in self._genotypes])

        if raw:
            return raw_array
        else:
            result = []
            for m, gcode in enumerate(raw_array):
                result.append(self._marker_allele_lookup[m][gcode])
            return result

    def getGenotypeByIndices(self, s, m):
        """
        """
        if self._quick:
            # Determine which byte we need to seek to, and
            # which nibble within the byte we need
            seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
            seek_nibble = s % 4
            self._bedf.seek(seek_byte)
            geno = struct.unpack('B', self._bedf.read(1))[0]
            quartet = INT_TO_GCODE[geno]
            gcode = quartet[seek_nibble]
        else:
            # Otherwise, just grab the genotypes from the
            # list of arrays
            genos_for_marker = self._genotypes[m]
            gcode = genos_for_marker[s]

        return self._marker_allele_lookup[m][gcode]

    def getGenotypesByIndices(self, s, mlist, format):
        """
        """
        if self._quick:
            raw_array = array('i', [0]*len(mlist))
            seek_nibble = s % 4
            for i,m in enumerate(mlist):
                seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
                self._bedf.seek(seek_byte)
                geno = struct.unpack('B', self._bedf.read(1))[0]
                quartet = INT_TO_GCODE[geno]
                gcode = quartet[seek_nibble]
                raw_array[i] = gcode
            mlist = set(mlist)
        else:
            mlist = set(mlist)
            raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist])

        if format == 'raw':
            return raw_array
        elif format == 'ref':
            result = array('i', [0]*len(mlist))
            for m, gcode in enumerate(raw_array):
                if gcode == HOM0:
                    nref = 3
                elif gcode == HET:
                    nref = 2
                elif gcode == HOM1:
                    nref = 1
                else:
                    nref = 0
                result[m] = nref
            return result
        else:
            result = []
            for m, gcode in enumerate(raw_array):
                result.append(self._marker_allele_lookup[m][gcode])
            return result

    def getSubject(self, s):
        """
        """
        skey = self._ordered_subjects[s]
        return self._subjects[skey]

    def autosomal_indices(self):
        """ Return the indices of markers in this ped/map that are autosomal.
            This is used by rgGRR so that it can select a random set of markers
            from the autosomes (sex chroms screw up the plot)
        """
        return self._autosomal_indices

class Bed:

    def __init__(self, path):
        self.path = path
        self._genotypes = []
        self._fh = None

    def parse(self, subjects,  markers,  quick=False):
        """ Parse the bed file, indicated either by the path parameter,
            or as the self.path indicated in __init__.  If quick is
            True, then just parse the bim and fam, then genotypes will
            be looked up dynamically by indices
        """
        self._quick = quick

        ordered_markers = markers
        ordered_subjects = subjects
        nsubjects = len(ordered_subjects)
        nmarkers = len(ordered_markers)

        bed = open(self.path, 'rb')
        self._fh = bed

        byte1 = bed.read(1)
        byte2 = bed.read(1)
        byte3 = bed.read(1)
        format_flag = struct.unpack('B', byte3)[0]

        h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]])
        h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]])
        h3 = tuple(INT_TO_GCODE[format_flag])

        if h1 != MAGIC1 or h2 != MAGIC2:
            raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2))
        if format_flag:
            print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3)
        else:
            raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3)

        print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects)

        ### If quick mode was specified, we're done ...
        self._quick = quick
        if quick:
            return

        ### ... Otherwise, parse genotypes into an array, and append that
        ### array to self._genotypes
        ngcodes = ceiling(nsubjects, 4)
        bytes_per_marker = nbytes(nsubjects)
        for m in xrange(nmarkers):
            genotype_array = array('i', [-1]*(ngcodes))
            for byte in xrange(bytes_per_marker):
                intval = struct.unpack('B', bed.read(1))[0]
                idx = byte*4
                genotype_array[idx:idx+4] = INT_TO_GCODE[intval]
            self._genotypes.append(genotype_array)

class Bim:
    def __init__(self, path):
        """
        """
        self.path = path
        self._markers = {}
        self._ordered_markers = []
        self._marker_allele_lookup = {}
        self._autosomal_indices = set()

    def parse(self):
        """
        """
        print 'Reading map (extended format) from [ %s ]' % (self.path)
        bim = open(self.path, 'r')
        for m, line in enumerate(bim):
            chrom, snp, gpos, apos, a1, a2 = line.strip().split()
            self._markers[snp] = (chrom, snp, gpos, apos, a1, a2)
            self._marker_allele_lookup[m] = {
                HOM0: (a2, a2),
                HOM1: (a1, a1),
                HET : (a1, a2),
                MISS: ('0','0'),
                }
            self._ordered_markers.append(snp)
            if chrom in AUTOSOMES:
                self._autosomal_indices.add(m)
        bim.close()
        print '%s markers to be included from [ %s ]' % (m+1, self.path)

class Fam:
    def __init__(self, path):
        """
        """
        self.path = path
        self._subjects = {}
        self._ordered_subjects = []

    def parse(self):
        """
        """
        print 'Reading pedigree information from [ %s ]' % (self.path)
        fam = open(self.path, 'r')
        for s, line in enumerate(fam):
            fid, iid, did, mid, sex, phe = line.strip().split()
            sid = iid.split('.')[0]
            d_sid = did.split('.')[0]
            m_sid = mid.split('.')[0]
            skey = (fid, iid)
            self._ordered_subjects.append(skey)
            self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
        fam.close()
        print '%s individuals read from [ %s ]' % (s+1, self.path)

### Command-line functionality and testing
def test(arg):
    '''
    '''

    import time

    if arg == 'CAMP_AFFY.ped':
        print 'Testing bed.parse(quick=True)'
        s = time.time()
        bed = Bed(arg.replace('.ped', '.bed'))
        bed.parse(quick=True)
        print bed.getGenotype(('400118', '10300283'), 'rs2000467')
        print bed.getGenotype(('400118', '10101384'), 'rs2294019')
        print bed.getGenotype(('400121', '10101149'), 'rs2294019')
        print bed.getGenotype(('400123', '10200290'), 'rs2294019')
        assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4')
        e = time.time()
        print 'e-s = %s\n' % (e-s)

    print 'Testing bed.parse'
    s = time.time()
    bed = BPed(arg)
    bed.parse(quick=False)
    e = time.time()
    print 'e-s = %s\n' % (e-s)

    print 'Testing bed.writeped'
    s = time.time()
    outname = '%s_BEDTEST' % (arg)
    bed.writeped(outname)
    e = time.time()
    print 'e-s = %s\n' % (e-s)
    del(bed)

    print 'Testing ped.parse'
    s = time.time()
    ped = LPed(arg)
    ped.parse()
    e = time.time()
    print 'e-s = %s\n' % (e-s)

    print 'Testing ped.writebed'
    s = time.time()
    outname = '%s_PEDTEST' % (arg)
    ped.writebed(outname)
    e = time.time()
    print 'e-s = %s\n' % (e-s)
    del(ped)

def profile_bed(arg):
    """
    """
    bed = BPed(arg)
    bed.parse(quick=False)
    outname = '%s_BEDPROFILE' % (arg)
    bed.writeped(outname)

def profile_ped(arg):
    """
    """
    ped = LPed(arg)
    ped.parse()
    outname = '%s_PEDPROFILE' % (arg)
    ped.writebed(outname)

if __name__ == '__main__':
    """ Run as a command-line, this script should get one or more arguments,
        each one a ped file to be parsed with the PedParser (unit tests?)
    """
    op = optparse.OptionParser()
    op.add_option('--profile-bed', action='store_true', default=False)
    op.add_option('--profile-ped', action='store_true', default=False)
    opts, args = op.parse_args()

    if opts.profile_bed:
        import profile
        import pstats
        profile.run('profile_bed(args[0])', 'fooprof')
        p = pstats.Stats('fooprof')
        p.sort_stats('cumulative').print_stats(10)
    elif opts.profile_ped:
        import profile
        import pstats
        profile.run('profile_ped(args[0])', 'fooprof')
        p = pstats.Stats('fooprof')
        p.sort_stats('cumulative').print_stats(10)
    else:
        for arg in args:
            test(arg)

    ### Code used to generate the INT_TO_GCODE dictionary
    #print '{\n  ',
    #for i in range(256):
    #   b = INT2BIN[i]
    #    ints = []
    #    s = str(i).rjust(3)
    #    #print b
    #    for j in range(4):
    #        idx = j*2
    #        #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2)
    #        ints.append(int(b[idx:idx+2], 2))
    #    print '%s: array(\'i\', %s),' % (s,tuple(ints)),
    #    if i > 0 and (i+1) % 4 == 0:
    #        print '\n  ',
    #print '}'
author	xuebing
date	Fri, 09 Mar 2012 19:37:19 -0500
parents
children