diff tools/rgenetics/rgPedSub.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgPedSub.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,305 @@
+"""
+July 1 2009 added relatedness filter - fo/oo or all
+released under the terms of the LGPL
+copyright ross lazarus August 2007 
+for the rgenetics project
+
+Special galaxy tool for the camp2007 data
+Allows grabbing genotypes from an arbitrary region
+
+Needs a mongo results file in the location hardwired below or could be passed in as
+a library parameter - but this file must have a very specific structure
+rs chrom offset float1...floatn
+
+called as
+
+    <command interpreter="python2.4">
+        campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn"
+    </command>
+
+
+"""
+
+
+import sys, array, os, string
+from rgutils import galhtmlprefix,plinke,readMap
+
+progname = os.path.split(sys.argv[0])[1]
+
+
+atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
+
+def doImport(outfile='test',flist=[]):
+    """ import into one of the new html composite data types for Rgenetics
+        Dan Blankenberg with mods by Ross Lazarus 
+        October 2007
+    """
+    out = open(outfile,'w')
+    out.write(galhtmlprefix % progname)
+
+    if len(flist) > 0:
+        out.write('<ol>\n')
+        for i, data in enumerate( flist ):
+           out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
+        out.write('</ol>\n')
+    else:
+           out.write('No files found')
+    out.write("</div></body></html>")
+    out.close()
+
+def setupPedFilter(relfilter='oo',dfile=None):
+    """ figure out who to drop to satisfy relative filtering
+    note single offspring only from each family id
+    ordering of pdict keys makes this 'random' as the first one only is
+    kept if there are multiple sibs from same familyid.
+    """
+    dropId = {}
+    keepoff = (relfilter == 'oo')
+    keepfounder = (relfilter == 'fo')
+    pdict = {}
+    for row in dfile:
+        rowl = row.strip().split()
+        if len(rowl) > 6:
+            idk = (rowl[0],rowl[1])
+            pa =  (rowl[0],rowl[2]) # key for father
+            ma = (rowl[0],rowl[3]) # and mother
+            pdict[idk] = (pa,ma)
+    dfile.seek(0) # rewind
+    pk = pdict.keys()
+    for p in pk:
+        parents = pdict[p]
+        if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file
+            if keepfounder:
+                dropId[p] = 1 # flag for removal
+        elif keepoff:
+            dropId[p] = 1 # flag for removal 
+    if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...!   
+        famseen = {}
+        for p in pk: # look for multiples from same family - drop all but first
+             famid = p[0]
+             if famseen.get(famid,None):
+                 dropId[p] = 1 # already got one from this family
+             famseen.setdefault(famid,1)
+    return dropId
+   
+def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):
+    """ fbat format version
+    """
+    outname = os.path.join(outdir,basename)
+    pedfname = '%s.ped' % outname
+    ofile = file(pedfname, 'w')
+    rsl = ' '.join(rslist) # rslist for fbat
+    ofile.write(rsl)
+    s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50])
+    lf.write(s)
+    ofile.write('\n')
+    nrows = 0
+    for line in dfile:
+        line = line.strip()
+        if not line:
+            continue
+        line = line.replace('D','N')
+        fields = line.split()
+        preamble = fields[:6]
+        idk = (preamble[0],preamble[1])
+        dropme = dropId.get(idk,None)
+        if not dropme:
+            g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
+            g = ' '.join(g)
+            g = g.split() # we'll get there
+            g = [atrandic.get(x,'0') for x in g] # numeric alleles...
+            # hack for framingham ND
+            ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
+            nrows += 1
+    ofile.close()
+    loglist = open(logfile,'r').readlines() # retrieve log to add to html file
+    doImport(outfile,[pedfname,],loglist=loglist)
+    return nrows,pedfname
+
+def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None):
+    """ split out
+    """
+    outname = os.path.join(outdir,basename)
+    mapfname = '%s.map' % outname
+    pedfname = '%s.ped' % outname
+    ofile = file(pedfname, 'w')
+    # make a map file in the lped library
+    mf = file(mapfname,'w')
+    map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order
+    mf.write('%s\n' % '\n'.join(map))
+    mf.close()
+    nrows = 0
+    for line in dfile:
+        line = line.strip()
+        if not line:
+            continue
+        #line = line.replace('D','N')
+        fields = line.split()
+        preamble = fields[:6]
+        idk = (preamble[0],preamble[1])
+        dropme = dropId.get(idk,None)
+        if not dropme:
+            g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
+            g = ' '.join(g)
+            g = g.split() # we'll get there
+            g = [atrandic.get(x,'0') for x in g] # numeric alleles...
+            # hack for framingham ND
+            ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
+            nrows += 1
+    ofile.close()
+    loglist = open(logfile,'r').readlines() # retrieve log to add to html file
+    doImport(outfile,[mapfname,pedfname,logfile])
+    return nrows,pedfname
+    
+def subset():
+    """  ### Sanity check the arguments
+    now passed in as 
+    <command interpreter="python">
+        rgPedSub.py $script_file
+    </command>
+
+    with
+    <configfiles>
+    <configfile name="script_file">
+    title~~~~$title
+    output1~~~~$output1
+    log_file~~~~$log_file
+    userId~~~~$userId
+    outformat~~~~$outformat
+    outdir~~~~$output1.extra_files_path
+    relfilter~~~~$relfilter
+    #if $d.source=='library'
+    inped~~~~$d.lpedIn
+    #else 
+    inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name
+    #end if
+    #if $m.mtype=='grslist'
+    rslist~~~~$m.rslist
+    region~~~~	
+    #else 
+    rslist~~~~	
+    region~~~~$m.region
+    #end if
+    </configfile>
+    </configfiles>    
+    """
+    sep = '~~~~' # arbitrary choice
+    conf = {}
+    if len(sys.argv) < 2:
+        print >> sys.stderr, "No configuration file passed as a parameter - cannot run"
+        sys.exit(1)
+    configf = sys.argv[1]
+    config = file(configf,'r').readlines()
+    for row in config:
+        row = row.strip()
+        if len(row) > 0:
+            try:
+                key,valu = row.split(sep)
+                conf[key] = valu
+            except:
+                pass
+    ss = '%s%s' % (string.punctuation,string.whitespace)
+    ptran =  string.maketrans(ss,'_'*len(ss))
+    ### Figure out what genomic region we are interested in
+    region = conf.get('region','')
+    orslist = conf.get('rslist','').replace('X',' ').lower()
+    orslist = orslist.replace(',',' ').lower()
+    # galaxy replaces newlines with XX - go figure
+    title = conf.get('title','').translate(ptran) # for outputs
+    outfile = conf.get('output1','')
+    outdir = conf.get('outdir','')
+    try:
+        os.makedirs(outdir)
+    except:
+        pass
+    outformat = conf.get('outformat','lped')
+    basename = conf.get('basename',title)
+    logfile = os.path.join(outdir,'%s.log' % title) 
+    userId = conf.get('userId','') # for library
+    pedFileBase = conf.get('inped','')
+    relfilter = conf.get('relfilter','')
+    MAP_FILE = '%s.map' % pedFileBase
+    DATA_FILE = '%s.ped' % pedFileBase    
+    title = conf.get('title','lped subset')
+    lf = file(logfile,'w')
+    lf.write('config file %s = \n' % configf)
+    lf.write(''.join(config))
+    c = ''
+    spos = epos = 0
+    rslist = []
+    rsdict = {}
+    if region > '':
+        try: # TODO make a regexp?
+            c,rest = region.split(':')
+            c = c.replace('chr','')
+            rest = rest.replace(',','') # remove commas
+            spos,epos = rest.split('-')
+            spos = int(spos)
+            epos = int(epos)
+            s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos)
+            lf.write(s)
+        except:
+            s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region)
+            lf.write(s)
+            lf.close()
+            sys.exit(1)
+    else:
+        rslist = orslist.split() # galaxy replaces newlines with XX - go figure
+        rsdict = dict(zip(rslist,rslist))
+    allmarkers = False
+    if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something
+        allmarkers = True
+    ### Figure out which markers are in this region
+    markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos)
+    if len(rslist) == 0:
+            s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3])
+            lf.write(s)
+            lf.write('\n')
+            lf.close()
+            sys.exit(1)
+    s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5])
+    lf.write(s)
+    try:
+        dfile = open(DATA_FILE, 'r')
+    except: # bad input file name?
+        s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE)
+        lf.write(s)
+        lf.write('\n')
+        lf.close()
+        print >> sys.stdout, s
+        raise
+        sys.exit(1)
+    if relfilter <> 'all': # must read pedigree and figure out who to drop
+        dropId = setupPedFilter(relfilter=relfilter,dfile=dfile)
+    else:
+        dropId = {}
+    wewant = [(6+(2*snpcols[x])) for x in rslist] # 
+    # column indices of first geno of each marker pair to get the markers into genomic
+    ### ... and then parse the rest of the ped file to pull out
+    ### the genotypes for all subjects for those markers
+    # /usr/local/galaxy/data/rg/1/lped/
+    if len(dropId.keys()) > 0:
+        s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter
+        lf.write(s)
+        if relfilter == 'oo':
+            s = '## note that one random offspring from each family was kept if there were multiple offspring\n'
+            lf.write(s)
+        s = 'FamilyId\tSubjectId\n'
+        lf.write(s)
+        dk = dropId.keys()
+        dk.sort()
+        for k in dk:
+            s = '%s\t%s\n' % (k[0],k[1])
+            lf.write(s)
+    lf.write('\n')
+    lf.close()
+    if outformat == 'lped':
+        nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile,
+                 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)
+    elif outformat == 'fped':
+        nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile,
+                  wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile)
+    dfile.close()    
+
+if __name__ == "__main__":
+    subset()