| 0 | 1 """ | 
|  | 2 July 1 2009 added relatedness filter - fo/oo or all | 
|  | 3 released under the terms of the LGPL | 
|  | 4 copyright ross lazarus August 2007 | 
|  | 5 for the rgenetics project | 
|  | 6 | 
|  | 7 Special galaxy tool for the camp2007 data | 
|  | 8 Allows grabbing genotypes from an arbitrary region | 
|  | 9 | 
|  | 10 Needs a mongo results file in the location hardwired below or could be passed in as | 
|  | 11 a library parameter - but this file must have a very specific structure | 
|  | 12 rs chrom offset float1...floatn | 
|  | 13 | 
|  | 14 called as | 
|  | 15 | 
|  | 16     <command interpreter="python2.4"> | 
|  | 17         campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn" | 
|  | 18     </command> | 
|  | 19 | 
|  | 20 | 
|  | 21 """ | 
|  | 22 | 
|  | 23 | 
|  | 24 import sys, array, os, string | 
|  | 25 from rgutils import galhtmlprefix,plinke,readMap | 
|  | 26 | 
|  | 27 progname = os.path.split(sys.argv[0])[1] | 
|  | 28 | 
|  | 29 | 
|  | 30 atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'} | 
|  | 31 | 
|  | 32 def doImport(outfile='test',flist=[]): | 
|  | 33     """ import into one of the new html composite data types for Rgenetics | 
|  | 34         Dan Blankenberg with mods by Ross Lazarus | 
|  | 35         October 2007 | 
|  | 36     """ | 
|  | 37     out = open(outfile,'w') | 
|  | 38     out.write(galhtmlprefix % progname) | 
|  | 39 | 
|  | 40     if len(flist) > 0: | 
|  | 41         out.write('<ol>\n') | 
|  | 42         for i, data in enumerate( flist ): | 
|  | 43            out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) | 
|  | 44         out.write('</ol>\n') | 
|  | 45     else: | 
|  | 46            out.write('No files found') | 
|  | 47     out.write("</div></body></html>") | 
|  | 48     out.close() | 
|  | 49 | 
|  | 50 def setupPedFilter(relfilter='oo',dfile=None): | 
|  | 51     """ figure out who to drop to satisfy relative filtering | 
|  | 52     note single offspring only from each family id | 
|  | 53     ordering of pdict keys makes this 'random' as the first one only is | 
|  | 54     kept if there are multiple sibs from same familyid. | 
|  | 55     """ | 
|  | 56     dropId = {} | 
|  | 57     keepoff = (relfilter == 'oo') | 
|  | 58     keepfounder = (relfilter == 'fo') | 
|  | 59     pdict = {} | 
|  | 60     for row in dfile: | 
|  | 61         rowl = row.strip().split() | 
|  | 62         if len(rowl) > 6: | 
|  | 63             idk = (rowl[0],rowl[1]) | 
|  | 64             pa =  (rowl[0],rowl[2]) # key for father | 
|  | 65             ma = (rowl[0],rowl[3]) # and mother | 
|  | 66             pdict[idk] = (pa,ma) | 
|  | 67     dfile.seek(0) # rewind | 
|  | 68     pk = pdict.keys() | 
|  | 69     for p in pk: | 
|  | 70         parents = pdict[p] | 
|  | 71         if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file | 
|  | 72             if keepfounder: | 
|  | 73                 dropId[p] = 1 # flag for removal | 
|  | 74         elif keepoff: | 
|  | 75             dropId[p] = 1 # flag for removal | 
|  | 76     if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...! | 
|  | 77         famseen = {} | 
|  | 78         for p in pk: # look for multiples from same family - drop all but first | 
|  | 79              famid = p[0] | 
|  | 80              if famseen.get(famid,None): | 
|  | 81                  dropId[p] = 1 # already got one from this family | 
|  | 82              famseen.setdefault(famid,1) | 
|  | 83     return dropId | 
|  | 84 | 
|  | 85 def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): | 
|  | 86     """ fbat format version | 
|  | 87     """ | 
|  | 88     outname = os.path.join(outdir,basename) | 
|  | 89     pedfname = '%s.ped' % outname | 
|  | 90     ofile = file(pedfname, 'w') | 
|  | 91     rsl = ' '.join(rslist) # rslist for fbat | 
|  | 92     ofile.write(rsl) | 
|  | 93     s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50]) | 
|  | 94     lf.write(s) | 
|  | 95     ofile.write('\n') | 
|  | 96     nrows = 0 | 
|  | 97     for line in dfile: | 
|  | 98         line = line.strip() | 
|  | 99         if not line: | 
|  | 100             continue | 
|  | 101         line = line.replace('D','N') | 
|  | 102         fields = line.split() | 
|  | 103         preamble = fields[:6] | 
|  | 104         idk = (preamble[0],preamble[1]) | 
|  | 105         dropme = dropId.get(idk,None) | 
|  | 106         if not dropme: | 
|  | 107             g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] | 
|  | 108             g = ' '.join(g) | 
|  | 109             g = g.split() # we'll get there | 
|  | 110             g = [atrandic.get(x,'0') for x in g] # numeric alleles... | 
|  | 111             # hack for framingham ND | 
|  | 112             ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) | 
|  | 113             nrows += 1 | 
|  | 114     ofile.close() | 
|  | 115     loglist = open(logfile,'r').readlines() # retrieve log to add to html file | 
|  | 116     doImport(outfile,[pedfname,],loglist=loglist) | 
|  | 117     return nrows,pedfname | 
|  | 118 | 
|  | 119 def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): | 
|  | 120     """ split out | 
|  | 121     """ | 
|  | 122     outname = os.path.join(outdir,basename) | 
|  | 123     mapfname = '%s.map' % outname | 
|  | 124     pedfname = '%s.ped' % outname | 
|  | 125     ofile = file(pedfname, 'w') | 
|  | 126     # make a map file in the lped library | 
|  | 127     mf = file(mapfname,'w') | 
|  | 128     map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order | 
|  | 129     mf.write('%s\n' % '\n'.join(map)) | 
|  | 130     mf.close() | 
|  | 131     nrows = 0 | 
|  | 132     for line in dfile: | 
|  | 133         line = line.strip() | 
|  | 134         if not line: | 
|  | 135             continue | 
|  | 136         #line = line.replace('D','N') | 
|  | 137         fields = line.split() | 
|  | 138         preamble = fields[:6] | 
|  | 139         idk = (preamble[0],preamble[1]) | 
|  | 140         dropme = dropId.get(idk,None) | 
|  | 141         if not dropme: | 
|  | 142             g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] | 
|  | 143             g = ' '.join(g) | 
|  | 144             g = g.split() # we'll get there | 
|  | 145             g = [atrandic.get(x,'0') for x in g] # numeric alleles... | 
|  | 146             # hack for framingham ND | 
|  | 147             ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) | 
|  | 148             nrows += 1 | 
|  | 149     ofile.close() | 
|  | 150     loglist = open(logfile,'r').readlines() # retrieve log to add to html file | 
|  | 151     doImport(outfile,[mapfname,pedfname,logfile]) | 
|  | 152     return nrows,pedfname | 
|  | 153 | 
|  | 154 def subset(): | 
|  | 155     """  ### Sanity check the arguments | 
|  | 156     now passed in as | 
|  | 157     <command interpreter="python"> | 
|  | 158         rgPedSub.py $script_file | 
|  | 159     </command> | 
|  | 160 | 
|  | 161     with | 
|  | 162     <configfiles> | 
|  | 163     <configfile name="script_file"> | 
|  | 164     title~~~~$title | 
|  | 165     output1~~~~$output1 | 
|  | 166     log_file~~~~$log_file | 
|  | 167     userId~~~~$userId | 
|  | 168     outformat~~~~$outformat | 
|  | 169     outdir~~~~$output1.extra_files_path | 
|  | 170     relfilter~~~~$relfilter | 
|  | 171     #if $d.source=='library' | 
|  | 172     inped~~~~$d.lpedIn | 
|  | 173     #else | 
|  | 174     inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name | 
|  | 175     #end if | 
|  | 176     #if $m.mtype=='grslist' | 
|  | 177     rslist~~~~$m.rslist | 
|  | 178     region~~~~ | 
|  | 179     #else | 
|  | 180     rslist~~~~ | 
|  | 181     region~~~~$m.region | 
|  | 182     #end if | 
|  | 183     </configfile> | 
|  | 184     </configfiles> | 
|  | 185     """ | 
|  | 186     sep = '~~~~' # arbitrary choice | 
|  | 187     conf = {} | 
|  | 188     if len(sys.argv) < 2: | 
|  | 189         print >> sys.stderr, "No configuration file passed as a parameter - cannot run" | 
|  | 190         sys.exit(1) | 
|  | 191     configf = sys.argv[1] | 
|  | 192     config = file(configf,'r').readlines() | 
|  | 193     for row in config: | 
|  | 194         row = row.strip() | 
|  | 195         if len(row) > 0: | 
|  | 196             try: | 
|  | 197                 key,valu = row.split(sep) | 
|  | 198                 conf[key] = valu | 
|  | 199             except: | 
|  | 200                 pass | 
|  | 201     ss = '%s%s' % (string.punctuation,string.whitespace) | 
|  | 202     ptran =  string.maketrans(ss,'_'*len(ss)) | 
|  | 203     ### Figure out what genomic region we are interested in | 
|  | 204     region = conf.get('region','') | 
|  | 205     orslist = conf.get('rslist','').replace('X',' ').lower() | 
|  | 206     orslist = orslist.replace(',',' ').lower() | 
|  | 207     # galaxy replaces newlines with XX - go figure | 
|  | 208     title = conf.get('title','').translate(ptran) # for outputs | 
|  | 209     outfile = conf.get('output1','') | 
|  | 210     outdir = conf.get('outdir','') | 
|  | 211     try: | 
|  | 212         os.makedirs(outdir) | 
|  | 213     except: | 
|  | 214         pass | 
|  | 215     outformat = conf.get('outformat','lped') | 
|  | 216     basename = conf.get('basename',title) | 
|  | 217     logfile = os.path.join(outdir,'%s.log' % title) | 
|  | 218     userId = conf.get('userId','') # for library | 
|  | 219     pedFileBase = conf.get('inped','') | 
|  | 220     relfilter = conf.get('relfilter','') | 
|  | 221     MAP_FILE = '%s.map' % pedFileBase | 
|  | 222     DATA_FILE = '%s.ped' % pedFileBase | 
|  | 223     title = conf.get('title','lped subset') | 
|  | 224     lf = file(logfile,'w') | 
|  | 225     lf.write('config file %s = \n' % configf) | 
|  | 226     lf.write(''.join(config)) | 
|  | 227     c = '' | 
|  | 228     spos = epos = 0 | 
|  | 229     rslist = [] | 
|  | 230     rsdict = {} | 
|  | 231     if region > '': | 
|  | 232         try: # TODO make a regexp? | 
|  | 233             c,rest = region.split(':') | 
|  | 234             c = c.replace('chr','') | 
|  | 235             rest = rest.replace(',','') # remove commas | 
|  | 236             spos,epos = rest.split('-') | 
|  | 237             spos = int(spos) | 
|  | 238             epos = int(epos) | 
|  | 239             s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos) | 
|  | 240             lf.write(s) | 
|  | 241         except: | 
|  | 242             s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region) | 
|  | 243             lf.write(s) | 
|  | 244             lf.close() | 
|  | 245             sys.exit(1) | 
|  | 246     else: | 
|  | 247         rslist = orslist.split() # galaxy replaces newlines with XX - go figure | 
|  | 248         rsdict = dict(zip(rslist,rslist)) | 
|  | 249     allmarkers = False | 
|  | 250     if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something | 
|  | 251         allmarkers = True | 
|  | 252     ### Figure out which markers are in this region | 
|  | 253     markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos) | 
|  | 254     if len(rslist) == 0: | 
|  | 255             s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3]) | 
|  | 256             lf.write(s) | 
|  | 257             lf.write('\n') | 
|  | 258             lf.close() | 
|  | 259             sys.exit(1) | 
|  | 260     s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5]) | 
|  | 261     lf.write(s) | 
|  | 262     try: | 
|  | 263         dfile = open(DATA_FILE, 'r') | 
|  | 264     except: # bad input file name? | 
|  | 265         s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE) | 
|  | 266         lf.write(s) | 
|  | 267         lf.write('\n') | 
|  | 268         lf.close() | 
|  | 269         print >> sys.stdout, s | 
|  | 270         raise | 
|  | 271         sys.exit(1) | 
|  | 272     if relfilter <> 'all': # must read pedigree and figure out who to drop | 
|  | 273         dropId = setupPedFilter(relfilter=relfilter,dfile=dfile) | 
|  | 274     else: | 
|  | 275         dropId = {} | 
|  | 276     wewant = [(6+(2*snpcols[x])) for x in rslist] # | 
|  | 277     # column indices of first geno of each marker pair to get the markers into genomic | 
|  | 278     ### ... and then parse the rest of the ped file to pull out | 
|  | 279     ### the genotypes for all subjects for those markers | 
|  | 280     # /usr/local/galaxy/data/rg/1/lped/ | 
|  | 281     if len(dropId.keys()) > 0: | 
|  | 282         s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter | 
|  | 283         lf.write(s) | 
|  | 284         if relfilter == 'oo': | 
|  | 285             s = '## note that one random offspring from each family was kept if there were multiple offspring\n' | 
|  | 286             lf.write(s) | 
|  | 287         s = 'FamilyId\tSubjectId\n' | 
|  | 288         lf.write(s) | 
|  | 289         dk = dropId.keys() | 
|  | 290         dk.sort() | 
|  | 291         for k in dk: | 
|  | 292             s = '%s\t%s\n' % (k[0],k[1]) | 
|  | 293             lf.write(s) | 
|  | 294     lf.write('\n') | 
|  | 295     lf.close() | 
|  | 296     if outformat == 'lped': | 
|  | 297         nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile, | 
|  | 298                  wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) | 
|  | 299     elif outformat == 'fped': | 
|  | 300         nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile, | 
|  | 301                   wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) | 
|  | 302     dfile.close() | 
|  | 303 | 
|  | 304 if __name__ == "__main__": | 
|  | 305     subset() |