comparison tools/rgenetics/rgHaploView.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 """
2 released under the terms of the LGPL
3 copyright ross lazarus August 2007
4 for the rgenetics project
5
6 Special galaxy tool for the camp2007 data
7 Allows grabbing genotypes from an arbitrary region and estimating
8 ld using haploview
9
10 stoopid haploview won't allow control of dest directory for plots - always end
11 up where the data came from - need to futz to get it where it belongs
12
13 Needs a mongo results file in the location hardwired below or could be passed in as
14 a library parameter - but this file must have a very specific structure
15 rs chrom offset float1...floatn
16
17
18 """
19
20
21 import sys, array, os, string, tempfile, shutil, subprocess, glob
22 from rgutils import galhtmlprefix
23
24 progname = os.path.split(sys.argv[0])[1]
25
26 javabin = 'java'
27 #hvbin = '/usr/local/bin/Haploview.jar'
28 #hvbin = '/home/universe/linux-i686/haploview/Haploview.jar'
29 # get this from tool as a parameter - can use
30
31
32
33 atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
34
35 class NullDevice:
36 """ a dev/null for ignoring output
37 """
38 def write(self, s):
39 pass
40
41 class ldPlot:
42
43 def __init__(self, argv=[]):
44 """
45 setup
46 """
47 self.args=argv
48 self.parseArgs(argv=self.args)
49 self.setupRegions()
50
51 def parseArgs(self,argv=[]):
52 """
53 """
54 ts = '%s%s' % (string.punctuation,string.whitespace)
55 ptran = string.maketrans(ts,'_'*len(ts))
56 ### Figure out what genomic region we are interested in
57 self.region = argv[1]
58 self.orslist = argv[2].replace('X',' ').lower() # galaxy replaces newlines with XX - go figure
59 self.title = argv[3].translate(ptran)
60 # for outputs
61 self.outfile = argv[4]
62 self.logfn = 'Log_%s.txt' % (self.title)
63 self.histextra = argv[5]
64 self.base_name = argv[6]
65 self.pedFileBase = os.path.join(self.histextra,self.base_name)
66 print 'pedfilebase=%s' % self.pedFileBase
67 self.minMaf=argv[7]
68 if self.minMaf:
69 try:
70 self.minMaf = float(self.minMaf)
71 except:
72 self.minMaf = 0.0
73 self.maxDist=argv[8] or None
74 self.ldType=argv[9] or 'RSQ'
75 self.hiRes = (argv[10].lower() == 'hi')
76 self.memSize= argv[11] or '1000'
77 self.memSize = int(self.memSize)
78 self.outfpath = argv[12]
79 self.infotrack = False # note that otherwise this breaks haploview in headless mode
80 #infotrack = argv[13] == 'info'
81 # this fails in headless mode as at april 2010 with haploview 4.2
82 self.tagr2 = argv[14] or '0.8'
83 hmpanels = argv[15] # eg "['CEU','YRI']"
84 if hmpanels:
85 hmpanels = hmpanels.replace('[','')
86 hmpanels = hmpanels.replace(']','')
87 hmpanels = hmpanels.replace("'",'')
88 hmpanels = hmpanels.split(',')
89 self.hmpanels = hmpanels
90 self.hvbin = argv[16] # added rml june 2008
91 self.bindir = os.path.split(self.hvbin)[0]
92 # jan 2010 - always assume utes are on path to avoid platform problems
93 self.pdfjoin = 'pdfjoin' # os.path.join(bindir,'pdfjoin')
94 self.pdfnup = 'pdfnup' # os.path.join(bindir,'pdfnup')
95 self.mogrify = 'mogrify' # os.path.join(bindir,'mogrify')
96 self.convert = 'convert' # os.path.join(bindir,'convert')
97 self.log_file = os.path.join(self.outfpath,self.logfn)
98 self.MAP_FILE = '%s.map' % self.pedFileBase
99 self.DATA_FILE = '%s.ped' % self.pedFileBase
100 try:
101 os.makedirs(self.outfpath)
102 s = '## made new path %s\n' % self.outfpath
103 except:
104 pass
105 self.lf = file(self.log_file,'w')
106 s = 'PATH=%s\n' % os.environ.get('PATH','?')
107 self.lf.write(s)
108
109 def getRs(self):
110 if self.region > '':
111 useRs = []
112 useRsdict={}
113 try: # TODO make a regexp?
114 c,rest = self.region.split(':')
115 chromosome = c.replace('chr','')
116 rest = rest.replace(',','') # remove commas
117 spos,epos = rest.split('-')
118 spos = int(spos)
119 epos = int(epos)
120 s = '## %s parsing chrom %s from %d to %d\n' % (progname,chromosome,spos,epos)
121 self.lf.write(s)
122 self.lf.write('\n')
123 print >> sys.stdout, s
124 except:
125 s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,self.region)
126 print >> sys.stdout, s
127 self.lf.write(s)
128 self.lf.write('\n')
129 self.lf.close()
130 sys.exit(1)
131 else:
132 useRs = self.orslist.split() # galaxy replaces newlines with XX - go figure
133 useRsdict = dict(zip(useRs,useRs))
134 return useRs, useRsdict
135
136
137 def setupRegions(self):
138 """
139 This turns out to be complex because we allow the user
140 flexibility - paste a list of rs or give a region.
141 In most cases, some subset has to be generated correctly before running Haploview
142 """
143 chromosome = ''
144 spos = epos = -9
145 rslist = []
146 rsdict = {}
147 useRs,useRsdict = self.getRs()
148 self.useTemp = False
149 try:
150 dfile = open(self.DATA_FILE, 'r')
151 except: # bad input file name?
152 s = '##! RGeno unable to open file %s\n' % (self.DATA_FILE)
153 self.lf.write(s)
154 self.lf.write('\n')
155 self.lf.close()
156 print >> sys.stdout, s
157 raise
158 sys.exit(1)
159 try:
160 mfile = open(self.MAP_FILE, 'r')
161 except: # bad input file name?
162 s = '##! RGeno unable to open file %s' % (self.MAP_FILE)
163 lf.write(s)
164 lf.write('\n')
165 lf.close()
166 print >> sys.stdout, s
167 raise
168 sys.exit(1)
169 if len(useRs) > 0 or spos <> -9 : # subset region
170 self.useTemp = True
171 ### Figure out which markers are in this region
172 markers = []
173 snpcols = {}
174 chroms = {}
175 minpos = 2**32
176 maxpos = 0
177 for lnum,row in enumerate(mfile):
178 line = row.strip()
179 if not line: continue
180 chrom, snp, genpos, abspos = line.split()
181 try:
182 ic = int(chrom)
183 except:
184 ic = None
185 if ic and ic <= 23:
186 try:
187 abspos = int(abspos)
188 if abspos > maxpos:
189 maxpos = abspos
190 if abspos < minpos:
191 minpos = abspos
192 except:
193 abspos = epos + 999999999 # so next test fails
194 if useRsdict.get(snp,None) or (spos <> -9 and chrom == chromosome and (spos <= abspos <= epos)):
195 if chromosome == '':
196 chromosome = chrom
197 chroms.setdefault(chrom,chrom)
198 markers.append((chrom,abspos,snp)) # decorate for sort into genomic
199 snpcols[snp] = lnum # so we know which col to find genos for this marker
200 markers.sort()
201 rslist = [x[2] for x in markers] # drop decoration
202 rsdict = dict(zip(rslist,rslist))
203 if len(rslist) == 0:
204 s = '##! %s: Found no rs numbers matching %s' % (progname,self.args[1:3])
205 self.lf.write(s)
206 self.lf.write('\n')
207 self.lf.close()
208 print >> sys.stdout, s
209 sys.exit(1)
210 if spos == -9:
211 spos = minpos
212 epos = maxpos
213 s = '## %s looking for %d rs (%s)' % (progname,len(rslist),rslist[:5])
214 self.lf.write(s)
215 print >> sys.stdout, s
216 wewant = [(6+(2*snpcols[x])) for x in rslist] #
217 # column indices of first geno of each marker pair to get the markers into genomic
218 ### ... and then parse the rest of the ped file to pull out
219 ### the genotypes for all subjects for those markers
220 # /usr/local/galaxy/data/rg/1/lped/
221 self.tempMapName = os.path.join(self.outfpath,'%s.info' % self.title)
222 self.tempMap = file(self.tempMapName,'w')
223 self.tempPedName = os.path.join(self.outfpath,'%s.ped' % self.title)
224 self.tempPed = file(self.tempPedName,'w')
225 self.pngpath = '%s.LD.PNG' % self.tempPedName
226 map = ['%s\t%s' % (x[2],x[1]) for x in markers] # snp,abspos in genomic order for haploview
227 self.tempMap.write('%s\n' % '\n'.join(map))
228 self.tempMap.close()
229 nrows = 0
230 for line in dfile:
231 line = line.strip()
232 if not line:
233 continue
234 fields = line.split()
235 preamble = fields[:6]
236 g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
237 g = ' '.join(g)
238 g = g.split() # we'll get there
239 g = [atrandic.get(x,'0') for x in g] # numeric alleles...
240 self.tempPed.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
241 nrows += 1
242 self.tempPed.close()
243 s = '## %s: wrote %d markers, %d subjects for region %s\n' % (progname,len(rslist),nrows,self.region)
244 self.lf.write(s)
245 self.lf.write('\n')
246 print >> sys.stdout,s
247 else: # even if using all, must set up haploview info file instead of map
248 markers = []
249 chroms = {}
250 spos = sys.maxint
251 epos = -spos
252 for lnum,row in enumerate(mfile):
253 line = row.strip()
254 if not line: continue
255 chrom, snp, genpos, abspos = line.split()
256 try:
257 ic = int(chrom)
258 except:
259 ic = None
260 if ic and ic <= 23:
261 if chromosome == '':
262 chromosome = chrom
263 chroms.setdefault(chrom,chrom)
264 try:
265 p = int(abspos)
266 if p < spos and p <> 0:
267 spos = p
268 if p > epos and p <> 0:
269 epos = p
270 except:
271 pass
272 markers.append('%s %s' % (snp,abspos)) # no sort - pass
273 # now have spos and epos for hapmap if hmpanels
274 self.tempMapName = os.path.join(self.outfpath,'%s.info' % self.title)
275 self.tempMap = file(self.tempMapName,'w')
276 self.tempMap.write('\n'.join(markers))
277 self.tempMap.close()
278 self.tempPedName = os.path.join(self.outfpath,'%s.ped' % self.title)
279 try: # will fail on winblows!
280 os.symlink(self.DATA_FILE,self.tempPedName)
281 except:
282 shutil.copy(self.DATA_FILE,self.tempPedName) # wasteful but..
283 self.nchroms = len(chroms) # if > 1 can't really do this safely
284 dfile.close()
285 mfile.close()
286 self.spos = spos
287 self.epos = epos
288 self.chromosome = chromosome
289 if self.nchroms > 1:
290 s = '## warning - multiple chromosomes found in your map file - %s\n' % ','.join(chroms.keys())
291 self.lf.write(s)
292 print >> sys.stdout,s
293 sys.exit(1)
294
295 def run(self,vcl):
296 """
297 """
298 p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
299 retval = p.wait()
300 self.lf.write('## executing %s returned %d\n' % (vcl,retval))
301
302 def plotHmPanels(self,ste):
303 """
304 """
305 sp = '%d' % (self.spos/1000.) # hapmap wants kb
306 ep = '%d' % (self.epos/1000.)
307 fnum=0
308 for panel in self.hmpanels:
309 if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :)
310 ptran = panel.strip()
311 ptran = ptran.replace('+','_')
312 fnum += 1 # preserve an order or else we get sorted
313 vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize,
314 '-chromosome',self.chromosome, '-panel',panel.strip(),
315 '-hapmapDownload','-startpos',sp,'-endpos',ep,
316 '-ldcolorscheme',self.ldType]
317 if self.minMaf:
318 vcl += ['-minMaf','%f' % self.minMaf]
319 if self.maxDist:
320 vcl += ['-maxDistance',self.maxDist]
321 if self.hiRes:
322 vcl.append('-png')
323 else:
324 vcl.append('-compressedpng')
325 if self.infotrack:
326 vcl.append('-infoTrack')
327 p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf)
328 retval = p.wait()
329 inpng = 'Chromosome%s%s.LD.PNG' % (self.chromosome,panel)
330 inpng = inpng.replace(' ','') # mysterious spaces!
331 outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,self.chromosome)
332 # hack for stupid chb+jpt
333 outpng = outpng.replace(' ','')
334 tmppng = '%s.tmp.png' % self.title
335 tmppng = tmppng.replace(' ','')
336 outpng = os.path.split(outpng)[-1]
337 vcl = [self.convert, '-resize 800x400!', inpng, tmppng]
338 self.run(' '.join(vcl))
339 s = "text 10,300 'HapMap %s'" % ptran.strip()
340 vcl = [self.convert, '-pointsize 25','-fill maroon',
341 '-draw "%s"' % s, tmppng, outpng]
342 self.run(' '.join(vcl))
343 try:
344 os.remove(os.path.join(self.outfpath,tmppng))
345 except:
346 pass
347
348 def doPlots(self):
349 """
350 """
351 DATA_FILE = self.tempPedName # for haploview
352 INFO_FILE = self.tempMapName
353 fblog,blog = tempfile.mkstemp()
354 ste = open(blog,'w') # to catch the blather
355 # if no need to rewrite - set up names for haploview call
356 vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize,'-pairwiseTagging',
357 '-pedfile',DATA_FILE,'-info',INFO_FILE,'-tagrsqcounts',
358 '-tagrsqcutoff',self.tagr2, '-ldcolorscheme',self.ldType]
359 if self.minMaf:
360 vcl += ['-minMaf','%f' % self.minMaf]
361 if self.maxDist:
362 vcl += ['-maxDistance',self.maxDist]
363 if self.hiRes:
364 vcl.append('-png')
365 else:
366 vcl.append('-compressedpng')
367 if self.nchroms == 1:
368 vcl += ['-chromosome',self.chromosome]
369 if self.infotrack:
370 vcl.append('-infoTrack')
371 self.run(' '.join(vcl))
372 vcl = [self.mogrify, '-resize 800x400!', '*.PNG']
373 self.run(' '.join(vcl))
374 inpng = '%s.LD.PNG' % DATA_FILE # stupid but necessary - can't control haploview name mangle
375 inpng = inpng.replace(' ','')
376 inpng = os.path.split(inpng)[-1]
377 tmppng = '%s.tmp.png' % self.title
378 tmppng = tmppng.replace(' ','')
379 outpng = '1_%s.png' % self.title
380 outpng = outpng.replace(' ','')
381 outpng = os.path.split(outpng)[-1]
382 vcl = [self.convert, '-resize 800x400!', inpng, tmppng]
383 self.run(' '.join(vcl))
384 s = "text 10,300 '%s'" % self.title[:40]
385 vcl = [self.convert, '-pointsize 25','-fill maroon',
386 '-draw "%s"' % s, tmppng, outpng]
387 self.run(' '.join(vcl))
388 try:
389 os.remove(os.path.join(self.outfpath,tmppng))
390 except:
391 pass # label all the plots then delete all the .PNG files before munging
392 fnum=1
393 if self.hmpanels:
394 self.plotHmPanels(ste)
395 nimages = len(glob.glob(os.path.join(self.outfpath,'*.png'))) # rely on HaploView shouting - PNG @!
396 self.lf.write('### nimages=%d\n' % nimages)
397 if nimages > 0: # haploview may fail?
398 vcl = '%s -format pdf -resize 800x400! *.png' % self.mogrify
399 self.run(vcl)
400 vcl = '%s *.pdf --fitpaper true --outfile alljoin.pdf' % self.pdfjoin
401 self.run(vcl)
402 vcl = '%s alljoin.pdf --nup 1x%d --outfile allnup.pdf' % (self.pdfnup,nimages)
403 self.run(vcl)
404 vcl = '%s -resize x300 allnup.pdf allnup.png' % (self.convert)
405 self.run(vcl)
406 ste.close() # temp file used to catch haploview blather
407 hblather = open(blog,'r').readlines() # to catch the blather
408 os.unlink(blog)
409 if len(hblather) > 0:
410 self.lf.write('## In addition, Haploview complained:')
411 self.lf.write(''.join(hblather))
412 self.lf.write('\n')
413 self.lf.close()
414
415 def writeHtml(self):
416 """
417 """
418 flist = glob.glob(os.path.join(self.outfpath, '*'))
419 flist.sort()
420 ts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~' + string.whitespace
421 ftran = string.maketrans(ts,'_'*len(ts))
422 outf = file(self.outfile,'w')
423 outf.write(galhtmlprefix % progname)
424 s = '<h4>rgenetics for Galaxy %s, wrapping HaploView</h4>' % (progname)
425 outf.write(s)
426 mainthumb = 'allnup.png'
427 mainpdf = 'allnup.pdf'
428 if os.path.exists(os.path.join(self.outfpath,mainpdf)):
429 if not os.path.exists(os.path.join(self.outfpath,mainthumb)):
430 outf.write('<table><tr><td colspan="3"><a href="%s">Main combined LD plot</a></td></tr></table>\n' % (mainpdf))
431 else:
432 outf.write('<table><tr><td><a href="%s"><img src="%s" title="Main combined LD image" hspace="10" align="middle">' % (mainpdf,mainthumb))
433 outf.write('</td><td>Click the thumbnail at left to download the main combined LD image <a href=%s>%s</a></td></tr></table>\n' % (mainpdf,mainpdf))
434 else:
435 outf.write('(No main image was generated - this usually means a Haploview error connecting to Hapmap site - please try later)<br/>\n')
436 outf.write('<br><div><hr><ul>\n')
437 for i, data in enumerate( flist ):
438 dn = os.path.split(data)[-1]
439 if dn[:3] <> 'all':
440 continue
441 newdn = dn.translate(ftran)
442 if dn <> newdn:
443 os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))
444 dn = newdn
445 dnlabel = dn
446 ext = dn.split('.')[-1]
447 if dn == 'allnup.pdf':
448 dnlabel = 'All pdf plots on a single page'
449 elif dn == 'alljoin.pdf':
450 dnlabel = 'All pdf plots, each on a separate page'
451 outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
452 for i, data in enumerate( flist ):
453 dn = os.path.split(data)[-1]
454 if dn[:3] == 'all':
455 continue
456 newdn = dn.translate(ftran)
457 if dn <> newdn:
458 os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))
459 dn = newdn
460 dnlabel = dn
461 ext = dn.split('.')[-1]
462 if dn == 'allnup.pdf':
463 dnlabel = 'All pdf plots on a single page'
464 elif dn == 'alljoin.pdf':
465 dnlabel = 'All pdf plots, each on a separate page'
466 elif ext == 'info':
467 dnlabel = '%s map data for Haploview input' % self.title
468 elif ext == 'ped':
469 dnlabel = '%s genotype data for Haploview input' % self.title
470 elif dn.find('CEU') <> -1 or dn.find('YRI') <> -1 or dn.find('CHB_JPT') <> -1: # is hapmap
471 dnlabel = 'Hapmap data'
472 if ext == 'TAGS' or ext == 'TESTS' or ext == 'CHAPS':
473 dnlabel = dnlabel + ' Tagger output'
474 outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
475 outf.write('</ol><br>')
476 outf.write("</div><div><hr>Job Log follows below (see %s)<pre>" % self.logfn)
477 s = file(self.log_file,'r').readlines()
478 s = '\n'.join(s)
479 outf.write('%s</pre><hr></div>' % s)
480 outf.write('</body></html>')
481 outf.close()
482 if self.useTemp:
483 try:
484 os.unlink(self.tempMapName)
485 os.unlink(self.tempPedName)
486 except:
487 pass
488
489 if __name__ == "__main__":
490 """ ### Sanity check the arguments
491
492 <command interpreter="python">
493 rgHaploView.py "$ucsc_region" "$rslist" "$title" "$out_file1"
494 "$lhistIn.extra_files_path" "$lhistIn.metadata.base_name"
495 "$minmaf" "$maxdist" "$ldtype" "$hires" "$memsize" "$out_file1.files_path"
496 "$infoTrack" "$tagr2" "$hmpanel" ${GALAXY_DATA_INDEX_DIR}/rg/bin/haploview.jar
497 </command>
498
499 remember to figure out chromosome and complain if > 1?
500 and use the -chromosome <1-22,X,Y> parameter to haploview
501 skipcheck?
502 """
503 progname = os.path.split(sys.argv[0])[-1]
504 if len(sys.argv) < 16:
505 s = '##!%s: Expected 16 params in sys.argv, got %d (%s)' % (progname,len(sys.argv), sys.argv)
506 print s
507 sys.exit(1)
508 ld = ldPlot(argv = sys.argv)
509 ld.doPlots()
510 ld.writeHtml()
511
512
513