Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgPedSub.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 """ | |
| 2 July 1 2009 added relatedness filter - fo/oo or all | |
| 3 released under the terms of the LGPL | |
| 4 copyright ross lazarus August 2007 | |
| 5 for the rgenetics project | |
| 6 | |
| 7 Special galaxy tool for the camp2007 data | |
| 8 Allows grabbing genotypes from an arbitrary region | |
| 9 | |
| 10 Needs a mongo results file in the location hardwired below or could be passed in as | |
| 11 a library parameter - but this file must have a very specific structure | |
| 12 rs chrom offset float1...floatn | |
| 13 | |
| 14 called as | |
| 15 | |
| 16 <command interpreter="python2.4"> | |
| 17 campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn" | |
| 18 </command> | |
| 19 | |
| 20 | |
| 21 """ | |
| 22 | |
| 23 | |
| 24 import sys, array, os, string | |
| 25 from rgutils import galhtmlprefix,plinke,readMap | |
| 26 | |
| 27 progname = os.path.split(sys.argv[0])[1] | |
| 28 | |
| 29 | |
| 30 atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'} | |
| 31 | |
| 32 def doImport(outfile='test',flist=[]): | |
| 33 """ import into one of the new html composite data types for Rgenetics | |
| 34 Dan Blankenberg with mods by Ross Lazarus | |
| 35 October 2007 | |
| 36 """ | |
| 37 out = open(outfile,'w') | |
| 38 out.write(galhtmlprefix % progname) | |
| 39 | |
| 40 if len(flist) > 0: | |
| 41 out.write('<ol>\n') | |
| 42 for i, data in enumerate( flist ): | |
| 43 out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) | |
| 44 out.write('</ol>\n') | |
| 45 else: | |
| 46 out.write('No files found') | |
| 47 out.write("</div></body></html>") | |
| 48 out.close() | |
| 49 | |
| 50 def setupPedFilter(relfilter='oo',dfile=None): | |
| 51 """ figure out who to drop to satisfy relative filtering | |
| 52 note single offspring only from each family id | |
| 53 ordering of pdict keys makes this 'random' as the first one only is | |
| 54 kept if there are multiple sibs from same familyid. | |
| 55 """ | |
| 56 dropId = {} | |
| 57 keepoff = (relfilter == 'oo') | |
| 58 keepfounder = (relfilter == 'fo') | |
| 59 pdict = {} | |
| 60 for row in dfile: | |
| 61 rowl = row.strip().split() | |
| 62 if len(rowl) > 6: | |
| 63 idk = (rowl[0],rowl[1]) | |
| 64 pa = (rowl[0],rowl[2]) # key for father | |
| 65 ma = (rowl[0],rowl[3]) # and mother | |
| 66 pdict[idk] = (pa,ma) | |
| 67 dfile.seek(0) # rewind | |
| 68 pk = pdict.keys() | |
| 69 for p in pk: | |
| 70 parents = pdict[p] | |
| 71 if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file | |
| 72 if keepfounder: | |
| 73 dropId[p] = 1 # flag for removal | |
| 74 elif keepoff: | |
| 75 dropId[p] = 1 # flag for removal | |
| 76 if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...! | |
| 77 famseen = {} | |
| 78 for p in pk: # look for multiples from same family - drop all but first | |
| 79 famid = p[0] | |
| 80 if famseen.get(famid,None): | |
| 81 dropId[p] = 1 # already got one from this family | |
| 82 famseen.setdefault(famid,1) | |
| 83 return dropId | |
| 84 | |
| 85 def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): | |
| 86 """ fbat format version | |
| 87 """ | |
| 88 outname = os.path.join(outdir,basename) | |
| 89 pedfname = '%s.ped' % outname | |
| 90 ofile = file(pedfname, 'w') | |
| 91 rsl = ' '.join(rslist) # rslist for fbat | |
| 92 ofile.write(rsl) | |
| 93 s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50]) | |
| 94 lf.write(s) | |
| 95 ofile.write('\n') | |
| 96 nrows = 0 | |
| 97 for line in dfile: | |
| 98 line = line.strip() | |
| 99 if not line: | |
| 100 continue | |
| 101 line = line.replace('D','N') | |
| 102 fields = line.split() | |
| 103 preamble = fields[:6] | |
| 104 idk = (preamble[0],preamble[1]) | |
| 105 dropme = dropId.get(idk,None) | |
| 106 if not dropme: | |
| 107 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] | |
| 108 g = ' '.join(g) | |
| 109 g = g.split() # we'll get there | |
| 110 g = [atrandic.get(x,'0') for x in g] # numeric alleles... | |
| 111 # hack for framingham ND | |
| 112 ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) | |
| 113 nrows += 1 | |
| 114 ofile.close() | |
| 115 loglist = open(logfile,'r').readlines() # retrieve log to add to html file | |
| 116 doImport(outfile,[pedfname,],loglist=loglist) | |
| 117 return nrows,pedfname | |
| 118 | |
| 119 def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): | |
| 120 """ split out | |
| 121 """ | |
| 122 outname = os.path.join(outdir,basename) | |
| 123 mapfname = '%s.map' % outname | |
| 124 pedfname = '%s.ped' % outname | |
| 125 ofile = file(pedfname, 'w') | |
| 126 # make a map file in the lped library | |
| 127 mf = file(mapfname,'w') | |
| 128 map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order | |
| 129 mf.write('%s\n' % '\n'.join(map)) | |
| 130 mf.close() | |
| 131 nrows = 0 | |
| 132 for line in dfile: | |
| 133 line = line.strip() | |
| 134 if not line: | |
| 135 continue | |
| 136 #line = line.replace('D','N') | |
| 137 fields = line.split() | |
| 138 preamble = fields[:6] | |
| 139 idk = (preamble[0],preamble[1]) | |
| 140 dropme = dropId.get(idk,None) | |
| 141 if not dropme: | |
| 142 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] | |
| 143 g = ' '.join(g) | |
| 144 g = g.split() # we'll get there | |
| 145 g = [atrandic.get(x,'0') for x in g] # numeric alleles... | |
| 146 # hack for framingham ND | |
| 147 ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) | |
| 148 nrows += 1 | |
| 149 ofile.close() | |
| 150 loglist = open(logfile,'r').readlines() # retrieve log to add to html file | |
| 151 doImport(outfile,[mapfname,pedfname,logfile]) | |
| 152 return nrows,pedfname | |
| 153 | |
| 154 def subset(): | |
| 155 """ ### Sanity check the arguments | |
| 156 now passed in as | |
| 157 <command interpreter="python"> | |
| 158 rgPedSub.py $script_file | |
| 159 </command> | |
| 160 | |
| 161 with | |
| 162 <configfiles> | |
| 163 <configfile name="script_file"> | |
| 164 title~~~~$title | |
| 165 output1~~~~$output1 | |
| 166 log_file~~~~$log_file | |
| 167 userId~~~~$userId | |
| 168 outformat~~~~$outformat | |
| 169 outdir~~~~$output1.extra_files_path | |
| 170 relfilter~~~~$relfilter | |
| 171 #if $d.source=='library' | |
| 172 inped~~~~$d.lpedIn | |
| 173 #else | |
| 174 inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name | |
| 175 #end if | |
| 176 #if $m.mtype=='grslist' | |
| 177 rslist~~~~$m.rslist | |
| 178 region~~~~ | |
| 179 #else | |
| 180 rslist~~~~ | |
| 181 region~~~~$m.region | |
| 182 #end if | |
| 183 </configfile> | |
| 184 </configfiles> | |
| 185 """ | |
| 186 sep = '~~~~' # arbitrary choice | |
| 187 conf = {} | |
| 188 if len(sys.argv) < 2: | |
| 189 print >> sys.stderr, "No configuration file passed as a parameter - cannot run" | |
| 190 sys.exit(1) | |
| 191 configf = sys.argv[1] | |
| 192 config = file(configf,'r').readlines() | |
| 193 for row in config: | |
| 194 row = row.strip() | |
| 195 if len(row) > 0: | |
| 196 try: | |
| 197 key,valu = row.split(sep) | |
| 198 conf[key] = valu | |
| 199 except: | |
| 200 pass | |
| 201 ss = '%s%s' % (string.punctuation,string.whitespace) | |
| 202 ptran = string.maketrans(ss,'_'*len(ss)) | |
| 203 ### Figure out what genomic region we are interested in | |
| 204 region = conf.get('region','') | |
| 205 orslist = conf.get('rslist','').replace('X',' ').lower() | |
| 206 orslist = orslist.replace(',',' ').lower() | |
| 207 # galaxy replaces newlines with XX - go figure | |
| 208 title = conf.get('title','').translate(ptran) # for outputs | |
| 209 outfile = conf.get('output1','') | |
| 210 outdir = conf.get('outdir','') | |
| 211 try: | |
| 212 os.makedirs(outdir) | |
| 213 except: | |
| 214 pass | |
| 215 outformat = conf.get('outformat','lped') | |
| 216 basename = conf.get('basename',title) | |
| 217 logfile = os.path.join(outdir,'%s.log' % title) | |
| 218 userId = conf.get('userId','') # for library | |
| 219 pedFileBase = conf.get('inped','') | |
| 220 relfilter = conf.get('relfilter','') | |
| 221 MAP_FILE = '%s.map' % pedFileBase | |
| 222 DATA_FILE = '%s.ped' % pedFileBase | |
| 223 title = conf.get('title','lped subset') | |
| 224 lf = file(logfile,'w') | |
| 225 lf.write('config file %s = \n' % configf) | |
| 226 lf.write(''.join(config)) | |
| 227 c = '' | |
| 228 spos = epos = 0 | |
| 229 rslist = [] | |
| 230 rsdict = {} | |
| 231 if region > '': | |
| 232 try: # TODO make a regexp? | |
| 233 c,rest = region.split(':') | |
| 234 c = c.replace('chr','') | |
| 235 rest = rest.replace(',','') # remove commas | |
| 236 spos,epos = rest.split('-') | |
| 237 spos = int(spos) | |
| 238 epos = int(epos) | |
| 239 s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos) | |
| 240 lf.write(s) | |
| 241 except: | |
| 242 s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region) | |
| 243 lf.write(s) | |
| 244 lf.close() | |
| 245 sys.exit(1) | |
| 246 else: | |
| 247 rslist = orslist.split() # galaxy replaces newlines with XX - go figure | |
| 248 rsdict = dict(zip(rslist,rslist)) | |
| 249 allmarkers = False | |
| 250 if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something | |
| 251 allmarkers = True | |
| 252 ### Figure out which markers are in this region | |
| 253 markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos) | |
| 254 if len(rslist) == 0: | |
| 255 s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3]) | |
| 256 lf.write(s) | |
| 257 lf.write('\n') | |
| 258 lf.close() | |
| 259 sys.exit(1) | |
| 260 s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5]) | |
| 261 lf.write(s) | |
| 262 try: | |
| 263 dfile = open(DATA_FILE, 'r') | |
| 264 except: # bad input file name? | |
| 265 s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE) | |
| 266 lf.write(s) | |
| 267 lf.write('\n') | |
| 268 lf.close() | |
| 269 print >> sys.stdout, s | |
| 270 raise | |
| 271 sys.exit(1) | |
| 272 if relfilter <> 'all': # must read pedigree and figure out who to drop | |
| 273 dropId = setupPedFilter(relfilter=relfilter,dfile=dfile) | |
| 274 else: | |
| 275 dropId = {} | |
| 276 wewant = [(6+(2*snpcols[x])) for x in rslist] # | |
| 277 # column indices of first geno of each marker pair to get the markers into genomic | |
| 278 ### ... and then parse the rest of the ped file to pull out | |
| 279 ### the genotypes for all subjects for those markers | |
| 280 # /usr/local/galaxy/data/rg/1/lped/ | |
| 281 if len(dropId.keys()) > 0: | |
| 282 s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter | |
| 283 lf.write(s) | |
| 284 if relfilter == 'oo': | |
| 285 s = '## note that one random offspring from each family was kept if there were multiple offspring\n' | |
| 286 lf.write(s) | |
| 287 s = 'FamilyId\tSubjectId\n' | |
| 288 lf.write(s) | |
| 289 dk = dropId.keys() | |
| 290 dk.sort() | |
| 291 for k in dk: | |
| 292 s = '%s\t%s\n' % (k[0],k[1]) | |
| 293 lf.write(s) | |
| 294 lf.write('\n') | |
| 295 lf.close() | |
| 296 if outformat == 'lped': | |
| 297 nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile, | |
| 298 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) | |
| 299 elif outformat == 'fped': | |
| 300 nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile, | |
| 301 wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) | |
| 302 dfile.close() | |
| 303 | |
| 304 if __name__ == "__main__": | |
| 305 subset() |
