comparison tools/rgenetics/rgQC.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 # oct 15 rpy replaced - temp fix until we get gnuplot working
2 # rpy deprecated - replace with RRun
3 # fixes to run functional test! oct1 2009
4 # needed to expand our path with os.path.realpath to get newpath working with
5 # all the fancy pdfnup stuff
6 # and a fix to pruneld to write output to where it should be
7 # smallish data in test-data/smallwga in various forms
8 # python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
9 # child files are deprecated and broken as at july 19 2009
10 # need to move them to the html file extrafiles path
11 # found lots of corner cases with some illumina data where cnv markers were
12 # included
13 # and where affection status was all missing !
14 # added links to tab files showing worst 1/keepfrac markers and subjects
15 # ross lazarus january 2008
16 #
17 # added named parameters
18 # to ensure no silly slippages if non required parameter in the most general case
19 # some potentially useful things here reusable in complex scripts
20 # with lots'o'html (TM)
21 # aug 17 2007 rml
22 #
23 # added marker and subject and parenting april 14 rml
24 # took a while to get the absolute paths right for all the file munging
25 # as of april 16 seems to work..
26 # getting galaxy to serve images in html reports is a little tricky
27 # we don't want QC reports to be dozens of individual files, so need
28 # to use the url /static/rg/... since galaxy's web server will happily serve images
29 # from there
30 # galaxy passes output files as relative paths
31 # these have to be munged by rgQC.py before calling this
32 # galaxy will pass in 2 file names - one for the log
33 # and one for the final html report
34 # of the form './database/files/dataset_66.dat'
35 # we need to be working in that directory so our plink output files are there
36 # so these have to be munged by rgQC.py before calling this
37 # note no ped file passed so had to remove the -l option
38 # for plinkParse.py that makes a heterozygosity report from the ped
39 # file - needs fixing...
40 # new: importing manhattan/qqplot plotter
41 # def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
42 # """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
43 # contains some R scripts as text strings - we substitute defaults into the calls
44 # to make them do our bidding - and save the resulting code for posterity
45 # this can be called externally, I guess...for QC eg?
46 # """
47 #
48 # rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
49 # rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
50 # return rlog,flist
51
52
53 from optparse import OptionParser
54
55 import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
56 from os.path import abspath
57 from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
58 import rgManQQ
59
60 prog = os.path.split(sys.argv[0])[1]
61 vers = '0.4 april 2009 rml'
62 idjoiner = '_~_~_' # need something improbable..
63 # many of these may need fixing for a new install
64
65 myversion = vers
66 keepfrac = 20 # fraction to keep after sorting by each interesting value
67
68 missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change!
69
70 mogresize = "x300" # this controls the width for jpeg thumbnails
71
72
73
74
75 def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
76 """
77 marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
78 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
79 subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
80 """
81
82
83 def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
84 """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
85 # generic histogram and vertical boxplot in a 3:1 layout
86 # returns the graphic file name for inclusion in the web page
87 # broken out here for reuse
88 # ross lazarus march 2007
89 """
90 screenmat = (1,2,1,2) # create a 2x2 cabvas
91 widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
92 rpy.r.pdf( outfname, height , width )
93 #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
94 m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
95 # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
96 rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
97 maint = 'QC for %s - %s' % (basename,title)
98 rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
99 rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
100 rpy.r.dev_off()
101
102 def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
103 """
104 Useful to see what various cutoffs yield - plot percentiles
105 """
106 n = len(plotme)
107 maxveclen = 1000.0 # for reasonable pdf sizes!
108 yvec = copy.copy(plotme)
109 # arrives already in decending order of importance missingness or mendel count by subj or marker
110 xvec = range(n)
111 xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
112 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
113 if n > maxveclen: # oversample part of the distribution
114 always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
115 skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
116 samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
117 yvec = [yvec[i] for i in samplei] # always get first and last
118 xvec = [xvec[i] for i in samplei] # always get first and last
119 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
120 rpy.r.pdf( outfname, height , width )
121 maint = 'QC for %s - %s' % (basename,title)
122 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
123 rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
124 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
125 rpy.r.dev_off()
126
127 def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
128 """
129 y is data for a qq plot and ends up on the x axis go figure
130 if sampling, oversample low values - all the top 1% ?
131 this version called with -log10 transformed hwe
132 """
133 nrows = len(plotme)
134 fn = float(nrows)
135 xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
136 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
137 maxveclen = 3000
138 yvec = copy.copy(plotme)
139 if nrows > maxveclen:
140 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
141 # oversample part of the distribution
142 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
143 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
144 samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
145 # always oversample first sorted (here lowest) values
146 yvec = [yvec[i] for i in samplei] # always get first and last
147 xvec = [xvec[i] for i in samplei] # and sample xvec same way
148 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
149 else:
150 maint='Log QQ Plot(n=%d)' % (nrows)
151 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
152 ylab = '%s' % xlabname
153 xlab = '-log10(Uniform 0-1)'
154 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
155 rpy.r.pdf( outfname, height , width )
156 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
157 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
158 rpy.r.points(mx,mx,type='l')
159 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
160 rpy.r.dev_off()
161
162 def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
163 """
164 data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
165 grid of qq plots to show departure from null at extremes of data quality
166 Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
167 and ends up on the x axis go figure
168 if sampling, oversample low values - all the top 1% ?
169 """
170 data = copy.copy(plotme)
171 nvals = len(data)
172 stepsize = nvals/nsplits
173 logstep = math.log10(stepsize) # so is 3 for steps of 1000
174 quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
175 data.sort(key=itertools.itemgetter(1)) # into x order
176 rpy.r.pdf( outfname, height , width )
177 rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
178 yvec = [-math.log10(random.random()) for x in range(stepsize)]
179 yvec.sort() # size of each step is expected range for xvec under null?!
180 for rowstart in quints:
181 rowend = rowstart + stepsize
182 if nvals - rowend < stepsize: # finish last split
183 rowend = nvals
184 row = data[rowstart:rowend]
185 row.sort(key=itertools.itemgetter(2)) # into y order
186 for colstart in quints:
187 colend = colstart + stepsize
188 if nvals - colend < stepsize: # finish last split
189 colend = nvals
190 cell = row[colstart:colend]
191 xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
192 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
193 rpy.r.points(c(0,logstep),c(0,logstep),type='l')
194 rpy.r.dev_off()
195
196
197 def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
198 """
199 y is data for a qqnorm plot
200 if sampling, oversample low values - all the top 1% ?
201 """
202 rangeunif = len(plotme)
203 nunif = 1000
204 maxveclen = 3000
205 nrows = len(plotme)
206 data = copy.copy(plotme)
207 if nrows > maxveclen:
208 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
209 # oversample part of the distribution
210 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
211 skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
212 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
213 # always oversample first sorted (here lowest) values
214 yvec = [data[i] for i in samplei] # always get first and last
215 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
216 else:
217 yvec = data
218 maint='Log QQ Plot(n=%d)' % (nrows)
219 n = 1000
220 ylab = '%s' % xlabname
221 xlab = 'Normal'
222 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
223 rpy.r.pdf( outfname, height , width )
224 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
225 rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
226 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
227 rpy.r.dev_off()
228
229 def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
230 """
231 layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
232 like the GAIN qc tools
233 y is data for a qq plot and ends up on the x axis go figure
234 if sampling, oversample low values - all the top 1% ?
235 """
236 rangeunif = len(plotme)
237 nunif = 1000
238 fn = float(rangeunif)
239 xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
240 skip = max(int(rangeunif/fn),1)
241 # force include last points
242 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
243 maxveclen = 2000
244 nrows = len(plotme)
245 data = copy.copy(plotme)
246 data.sort() # low to high - oversample low values
247 if nrows > maxveclen:
248 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
249 # oversample part of the distribution
250 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
251 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
252 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
253 # always oversample first sorted (here lowest) values
254 yvec = [data[i] for i in samplei] # always get first and last
255 xvec = [xvec[i] for i in samplei] # and sample xvec same way
256 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
257 else:
258 yvec = data
259 maint='Log QQ Plot(n=%d)' % (nrows)
260 n = 1000
261 mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
262 ylab = '%s' % xlabname
263 xlab = '-log10(Uniform 0-1)'
264 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
265 rpy.r.pdf( outfname, height , width )
266 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
267 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
268 rpy.r.points(mx,mx,type='l')
269 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
270 rpy.r.dev_off()
271
272
273 fdsto,stofile = tempfile.mkstemp()
274 sto = open(stofile,'w')
275 import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
276 mog = 'mogrify'
277 pdfnup = 'pdfnup'
278 pdfjoin = 'pdfjoin'
279 shead = subjects.pop(0) # get rid of head
280 mhead = markers.pop(0)
281 maf = mhead.index('maf')
282 missfrac = mhead.index('missfrac')
283 logphweall = mhead.index('logp_hwe_all')
284 logphweunaff = mhead.index('logp_hwe_unaff')
285 # check for at least some unaffected rml june 2009
286 m_mendel = mhead.index('N_Mendel')
287 fracmiss = shead.index('FracMiss')
288 s_mendel = shead.index('Mendel_errors')
289 s_het = shead.index('F_Stat')
290 params = {}
291 hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
292 and x[logphweunaff].upper() <> 'NA']
293 if len(hweres) <> 0:
294 xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
295 # plot for each of these cols
296 else: # try hwe all instead - maybe no affection status available
297 xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
298 ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
299 oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
300 qqplotme = [1,0,0,0,0,0,0] #
301 qnplotme = [0,0,0,0,0,0,1] #
302 nplots = len(xs)
303 xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
304 'Marker Mendel Error Count', 'Missing Rate: Subjects',
305 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
306 plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
307 ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
308 ordplotnames = ['%s_cum' % x for x in plotnames]
309 ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
310 outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
311 ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
312 datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
313 titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
314 "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
315 html = []
316 pdflist = []
317 for n,column in enumerate(xs):
318 dat = [float(x[column]) for x in datasources[n] if len(x) >= column
319 and x[column][:2].upper() <> 'NA'] # plink gives both!
320 if sum(dat) <> 0: # eg nada for mendel if case control?
321 rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
322 title=titles[n],basename=basename,nbreaks=nbreaks)
323 row = [titles[n],ploturls[n],outfnames[n] ]
324 html.append(row)
325 pdflist.append(outfnames[n])
326 if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
327 otitle = 'Ranked %s' % titles[n]
328 dat.sort()
329 if oreverseme[n]:
330 dat.reverse()
331 rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
332 title=otitle,basename=basename,nbreaks=1000)
333 row = [otitle,ordploturls[n],ordoutfnames[n]]
334 html.append(row)
335 pdflist.append(ordoutfnames[n])
336 if qqplotme[n]: #
337 otitle = 'LogQQ plot %s' % titles[n]
338 dat.sort()
339 dat.reverse()
340 ofn = os.path.split(ordoutfnames[n])
341 ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
342 ofu = os.path.split(ordploturls[n])
343 ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
344 rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
345 title=otitle,basename=basename)
346 row = [otitle,ofu,ofn]
347 html.append(row)
348 pdflist.append(ofn)
349 elif qnplotme[n]:
350 otitle = 'F Statistic %s' % titles[n]
351 dat.sort()
352 dat.reverse()
353 ofn = os.path.split(ordoutfnames[n])
354 ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
355 ofu = os.path.split(ordploturls[n])
356 ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
357 rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
358 title=otitle,basename=basename)
359 row = [otitle,ofu,ofn]
360 html.append(row)
361 pdflist.append(ofn)
362 else:
363 print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
364 if nup>0:
365 # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
366 # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
367 filestojoin = ' '.join(pdflist) # all the file names so far
368 afname = '%s_All_Paged.pdf' % (basename)
369 fullafname = os.path.join(newfpath,afname)
370 expl = 'All %s QC Plots joined into a single pdf' % basename
371 vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname)
372 # make single page pdf
373 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
374 retval = x.wait()
375 row = [expl,afname,fullafname]
376 html.insert(0,row) # last rather than second
377 nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
378 fullnfname = os.path.join(newfpath,nfname)
379 expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
380 vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname)
381 # make thumbnail images
382 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
383 retval = x.wait()
384 row = [expl,nfname,fullnfname]
385 html.insert(1,row) # this goes second
386 vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf'))
387 # make thumbnail images
388 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
389 retval = x.wait()
390 sto.close()
391 cruft = open(stofile,'r').readlines()
392 return html,cruft # elements for an ordered list of urls or whatever..
393
394
395 def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
396 """
397 nice try but the R scripts are huge and take forever to run if there's a lot of data
398 marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
399 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
400 subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
401 """
402 colour = "maroon"
403
404 def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
405 """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
406 # generic histogram and vertical boxplot in a 3:1 layout
407 # returns the graphic file name for inclusion in the web page
408 # broken out here for reuse
409 # ross lazarus march 2007
410 """
411 R = []
412 maint = 'QC for %s - %s' % (basename,title)
413 screenmat = (1,2,1,2) # create a 2x2 canvas
414 widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
415 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
416 R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
417 R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
418 R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
419 R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
420 R.append('dev.off()')
421 return R
422
423 def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
424 """
425 Useful to see what various cutoffs yield - plot percentiles
426 """
427 R = []
428 n = len(plotme)
429 R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
430 # arrives already in decending order of importance missingness or mendel count by subj or marker
431 maint = 'QC for %s - %s' % (basename,title)
432 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
433 R.append("par(lab=c(10,10,10))")
434 R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
435 R.append('dev.off()')
436 return R
437
438 def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
439 """
440 y is data for a qq plot and ends up on the x axis go figure
441 if sampling, oversample low values - all the top 1% ?
442 this version called with -log10 transformed hwe
443 """
444 R = []
445 nrows = len(plotme)
446 fn = float(nrows)
447 xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
448 #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
449 maxveclen = 3000
450 yvec = copy.copy(plotme)
451 if nrows > maxveclen:
452 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
453 # oversample part of the distribution
454 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
455 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
456 if skip < 2:
457 skip = 2
458 samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
459 # always oversample first sorted (here lowest) values
460 yvec = [yvec[i] for i in samplei] # always get first and last
461 xvec = [xvec[i] for i in samplei] # and sample xvec same way
462 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
463 else:
464 maint='Log QQ Plot(n=%d)' % (nrows)
465 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
466 ylab = '%s' % xlabname
467 xlab = '-log10(Uniform 0-1)'
468 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
469 x = ['%f' % x for x in xvec]
470 R.append('xvec = c(%s)' % ','.join(x))
471 y = ['%f' % x for x in yvec]
472 R.append('yvec = c(%s)' % ','.join(y))
473 R.append('mx = c(0,%f)' % (math.log10(fn)))
474 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
475 R.append("par(lab=c(10,10,10))")
476 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
477 R.append('points(mx,mx,type="l")')
478 R.append('grid(col="lightgray",lty="dotted")')
479 R.append('dev.off()')
480 return R
481
482 def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
483 """
484 data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
485 grid of qq plots to show departure from null at extremes of data quality
486 Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
487 and ends up on the x axis go figure
488 if sampling, oversample low values - all the top 1% ?
489 """
490 data = copy.copy(plotme)
491 nvals = len(data)
492 stepsize = nvals/nsplits
493 logstep = math.log10(stepsize) # so is 3 for steps of 1000
494 R.append('mx = c(0,%f)' % logstep)
495 quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
496 data.sort(key=itertools.itemgetter(1)) # into x order
497 #rpy.r.pdf( outfname, h , w )
498 #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
499 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
500 yvec = [-math.log10(random.random()) for x in range(stepsize)]
501 yvec.sort() # size of each step is expected range for xvec under null?!
502 y = ['%f' % x for x in yvec]
503 R.append('yvec = c(%s)' % ','.join(y))
504 for rowstart in quints:
505 rowend = rowstart + stepsize
506 if nvals - rowend < stepsize: # finish last split
507 rowend = nvals
508 row = data[rowstart:rowend]
509 row.sort(key=itertools.itemgetter(2)) # into y order
510 for colstart in quints:
511 colend = colstart + stepsize
512 if nvals - colend < stepsize: # finish last split
513 colend = nvals
514 cell = row[colstart:colend]
515 xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
516 x = ['%f' % x for x in xvec]
517 R.append('xvec = c(%s)' % ','.join(x))
518 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
519 R.append('points(mx,mx,type="l")')
520 R.append('grid(col="lightgray",lty="dotted")')
521 #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
522 #rpy.r.points(c(0,logstep),c(0,logstep),type='l')
523 R.append('dev.off()')
524 #rpy.r.dev_off()
525 return R
526
527
528 def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
529 """
530 y is data for a qqnorm plot
531 if sampling, oversample low values - all the top 1% ?
532 """
533 rangeunif = len(plotme)
534 nunif = 1000
535 maxveclen = 3000
536 nrows = len(plotme)
537 data = copy.copy(plotme)
538 if nrows > maxveclen:
539 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
540 # oversample part of the distribution
541 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
542 skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
543 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
544 # always oversample first sorted (here lowest) values
545 yvec = [data[i] for i in samplei] # always get first and last
546 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
547 else:
548 yvec = data
549 maint='Log QQ Plot(n=%d)' % (nrows)
550 n = 1000
551 ylab = '%s' % xlabname
552 xlab = 'Normal'
553 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
554 #rpy.r.pdf( outfname, h , w )
555 #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
556 #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
557 #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
558 #rpy.r.dev_off()
559 y = ['%f' % x for x in yvec]
560 R.append('yvec = c(%s)' % ','.join(y))
561 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
562 R.append("par(lab=c(10,10,10))")
563 R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
564 R.append('grid(col="lightgray",lty="dotted")')
565 R.append('dev.off()')
566 return R
567
568 def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
569 """
570 layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
571 like the GAIN qc tools
572 y is data for a qq plot and ends up on the x axis go figure
573 if sampling, oversample low values - all the top 1% ?
574 """
575 rangeunif = len(plotme)
576 nunif = 1000
577 fn = float(rangeunif)
578 xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
579 skip = max(int(rangeunif/fn),1)
580 # force include last points
581 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
582 maxveclen = 2000
583 nrows = len(plotme)
584 data = copy.copy(plotme)
585 data.sort() # low to high - oversample low values
586 if nrows > maxveclen:
587 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
588 # oversample part of the distribution
589 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
590 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
591 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
592 # always oversample first sorted (here lowest) values
593 yvec = [data[i] for i in samplei] # always get first and last
594 xvec = [xvec[i] for i in samplei] # and sample xvec same way
595 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
596 else:
597 yvec = data
598 maint='Log QQ Plot(n=%d)' % (nrows)
599 n = 1000
600 mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
601 ylab = '%s' % xlabname
602 xlab = '-log10(Uniform 0-1)'
603 R.append('mx = c(0,%f)' % (math.log10(fn)))
604 x = ['%f' % x for x in xvec]
605 R.append('xvec = c(%s)' % ','.join(x))
606 y = ['%f' % x for x in yvec]
607 R.append('yvec = c(%s)' % ','.join(y))
608 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
609 R.append("par(lab=c(10,10,10))")
610 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
611 R.append('points(mx,mx,type="l")')
612 R.append('grid(col="lightgray",lty="dotted")')
613 R.append('dev.off()')
614
615
616 shead = subjects.pop(0) # get rid of head
617 mhead = markers.pop(0)
618 maf = mhead.index('maf')
619 missfrac = mhead.index('missfrac')
620 logphweall = mhead.index('logp_hwe_all')
621 logphweunaff = mhead.index('logp_hwe_unaff')
622 # check for at least some unaffected rml june 2009
623 m_mendel = mhead.index('N_Mendel')
624 fracmiss = shead.index('FracMiss')
625 s_mendel = shead.index('Mendel_errors')
626 s_het = shead.index('F_Stat')
627 params = {}
628 h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
629 and x[logphweunaff].upper() <> 'NA']
630 if len(h) <> 0:
631 xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
632 # plot for each of these cols
633 else: # try hwe all instead - maybe no affection status available
634 xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
635 ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
636 oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
637 qqplotme = [1,0,0,0,0,0,0] #
638 qnplotme = [0,0,0,0,0,0,1] #
639 nplots = len(xs)
640 xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
641 'Marker Mendel Error Count', 'Missing Rate: Subjects',
642 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
643 plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
644 ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
645 ordplotnames = ['%s_cum' % x for x in plotnames]
646 ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
647 outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
648 ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
649 datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
650 titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
651 "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
652 html = []
653 pdflist = []
654 R = []
655 for n,column in enumerate(xs):
656 dfn = '%d_%s.txt' % (n,titles[n])
657 dfilepath = os.path.join(newfpath,dfn)
658 dat = [float(x[column]) for x in datasources[n] if len(x) >= column
659 and x[column][:2].upper() <> 'NA'] # plink gives both!
660 if sum(dat) <> 0: # eg nada for mendel if case control?
661 plotme = file(dfilepath,'w')
662 plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
663 tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
664 title=titles[n],basename=basename,nbreaks=nbreaks)
665 R += tR
666 row = [titles[n],ploturls[n],outfnames[n] ]
667 html.append(row)
668 pdflist.append(outfnames[n])
669 if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
670 otitle = 'Ranked %s' % titles[n]
671 dat.sort()
672 if oreverseme[n]:
673 dat.reverse()
674 ndat = len(dat)
675 xvec = range(ndat)
676 xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
677 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
678 maxveclen = 1000.0 # for reasonable pdf sizes!
679 if ndat > maxveclen: # oversample part of the distribution
680 always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
681 skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
682 samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
683 yvec = [yvec[i] for i in samplei] # always get first and last
684 xvec = [xvec[i] for i in samplei] # always get first and last
685 plotme = file(dfilepath,'w')
686 plotme.write('xvec\tyvec\n')
687 plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout!
688 tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
689 title=otitle,basename=basename,nbreaks=nbreaks)
690 R += tR
691 row = [otitle,ordploturls[n],ordoutfnames[n]]
692 html.append(row)
693 pdflist.append(ordoutfnames[n])
694 if qqplotme[n]: #
695 otitle = 'LogQQ plot %s' % titles[n]
696 dat.sort()
697 dat.reverse()
698 ofn = os.path.split(ordoutfnames[n])
699 ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
700 ofu = os.path.split(ordploturls[n])
701 ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
702 tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
703 title=otitle,basename=basename)
704 R += tR
705 row = [otitle,ofu,ofn]
706 html.append(row)
707 pdflist.append(ofn)
708 elif qnplotme[n]:
709 otitle = 'F Statistic %s' % titles[n]
710 dat.sort()
711 dat.reverse()
712 ofn = os.path.split(ordoutfnames[n])
713 ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
714 ofu = os.path.split(ordploturls[n])
715 ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
716 tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
717 title=otitle,basename=basename)
718 R += tR
719 row = [otitle,ofu,ofn]
720 html.append(row)
721 pdflist.append(ofn)
722 else:
723 print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
724 rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
725 if nup>0:
726 # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
727 # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
728 filestojoin = ' '.join(pdflist) # all the file names so far
729 afname = '%s_All_Paged.pdf' % (basename)
730 fullafname = os.path.join(newfpath,afname)
731 expl = 'All %s QC Plots joined into a single pdf' % basename
732 vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
733 # make single page pdf
734 x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
735 retval = x.wait()
736 row = [expl,afname,fullafname]
737 html.insert(0,row) # last rather than second
738 nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
739 fullnfname = os.path.join(newfpath,nfname)
740 expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
741 vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
742 # make thumbnail images
743 x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
744 retval = x.wait()
745 row = [expl,nfname,fullnfname]
746 html.insert(1,row) # this goes second
747 vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
748 # make thumbnail images
749 x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
750 retval = x.wait()
751 return html # elements for an ordered list of urls or whatever..
752
753 def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
754 """
755 NO LONGER USED - historical interest
756 count het loci for each subject to look for outliers = ? contamination
757 assume ped file is linkage format
758 need to make a ped file from the bed file we were passed
759 """
760 vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
761 p=subprocess.Popen(' '.join(vcl),shell=True)
762 retval = p.wait()
763 f = open('%s_recode.recode.ped' % froot,'r')
764 if not linkageped:
765 head = f.next() # throw away header
766 hets = [] # simple count of het loci per subject. Expect poisson?
767 n = 1
768 for l in f:
769 n += 1
770 ll = l.strip().split()
771 if len(ll) > 6:
772 iid = idjoiner.join(ll[:2]) # fam_iid
773 gender = ll[4]
774 alleles = ll[6:]
775 nallele = len(alleles)
776 nhet = 0
777 for i in range(nallele/2):
778 a1=alleles[2*i]
779 a2=alleles[2*i+1]
780 if alleles[2*i] <> alleles[2*i+1]: # must be het
781 if not missvals.get(a1,None) and not missvals.get(a2,None):
782 nhet += 1
783 hets.append((nhet,iid,gender)) # for sorting later
784 f.close()
785 hets.sort()
786 hets.reverse() # biggest nhet now on top
787 f = open(outfname ,'w')
788 res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
789 res.insert(0,'nhetloci\tfamid_iid\tgender')
790 res.append('')
791 f.write('\n'.join(res))
792 f.close()
793
794
795
796 def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
797 """by subject (missingness = .imiss, mendel = .imendel)
798 assume replicates have an underscore in family id for
799 hapmap testing
800 For sorting, we need floats and integers
801 """
802 isexfile = '%s.sexcheck' % froot
803 imissfile = '%s.imiss' % froot
804 imendfile = '%s.imendel' % froot
805 ihetfile = '%s.het' % froot
806 logf.write('## subject reports starting at %s\n' % timenow())
807 outfile = os.path.join(newfpath,outfname)
808 idlist = []
809 imissdict = {}
810 imenddict = {}
811 isexdict = {}
812 ihetdict = {}
813 Tops = {}
814 Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
815 'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
816 Tsorts = [2,3,6,8]
817 Treverse = [True,True,True,False] # so first values are worser
818 #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
819 ## FID IID MISS_PHENO N_MISS N_GENO F_MISS
820 ## 1552042370_A 1552042370_A N 5480 549883 0.009966
821 ## 1552042410_A 1552042410_A N 1638 549883 0.002979
822
823 # ------------------missing--------------------
824 # imiss has FID IID MISS_PHENO N_MISS F_MISS
825 # we want F_MISS
826 try:
827 f = open(imissfile,'r')
828 except:
829 logf.write('# file %s is missing - talk about irony\n' % imissfile)
830 f = None
831 if f:
832 for n,line in enumerate(f):
833 ll = line.strip().split()
834 if n == 0:
835 head = [x.upper() for x in ll] # expect above
836 fidpos = head.index('FID')
837 iidpos = head.index('IID')
838 fpos = head.index('F_MISS')
839 elif len(ll) >= fpos: # full line
840 fid = ll[fidpos]
841 #if fid.find('_') == -1: # not replicate! - icondb ids have these...
842 iid = ll[iidpos]
843 fmiss = ll[fpos]
844 id = '%s%s%s' % (fid,idjoiner,iid)
845 imissdict[id] = fmiss
846 idlist.append(id)
847 f.close()
848 logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
849 # ------------------mend-------------------
850 # *.imendel has FID IID N
851 # we want N
852 gotmend = True
853 try:
854 f = open(imendfile,'r')
855 except:
856 gotmend = False
857 for id in idlist:
858 imenddict[id] = '0'
859 if gotmend:
860 for n,line in enumerate(f):
861 ll = line.strip().split()
862 if n == 0:
863 head = [x.upper() for x in ll] # expect above
864 npos = head.index('N')
865 fidpos = head.index('FID')
866 iidpos = head.index('IID')
867 elif len(ll) >= npos: # full line
868 fid = ll[fidpos]
869 iid = ll[iidpos]
870 id = '%s%s%s' % (fid,idjoiner,iid)
871 nmend = ll[npos]
872 imenddict[id] = nmend
873 f.close()
874 else:
875 logf.write('## error No %s file - assuming not family data\n' % imendfile)
876 # ------------------sex check------------------
877 #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
878 # sexcheck has FID IID PEDSEX SNPSEX STATUS F
879 ##
880 ## FID Family ID
881 ## IID Individual ID
882 ## PEDSEX Sex as determined in pedigree file (1=male, 2=female)
883 ## SNPSEX Sex as determined by X chromosome
884 ## STATUS Displays "PROBLEM" or "OK" for each individual
885 ## F The actual X chromosome inbreeding (homozygosity) estimate
886 ##
887 ## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
888 ## ambiguous with regard to sex.
889 ## A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2.
890 try:
891 f = open(isexfile,'r')
892 got_sexcheck = 1
893 except:
894 got_sexcheck = 0
895 if got_sexcheck:
896 for n,line in enumerate(f):
897 ll = line.strip().split()
898 if n == 0:
899 head = [x.upper() for x in ll] # expect above
900 fidpos = head.index('FID')
901 iidpos = head.index('IID')
902 pedsexpos = head.index('PEDSEX')
903 snpsexpos = head.index('SNPSEX')
904 statuspos = head.index('STATUS')
905 fpos = head.index('F')
906 elif len(ll) >= fpos: # full line
907 fid = ll[fidpos]
908 iid = ll[iidpos]
909 fest = ll[fpos]
910 pedsex = ll[pedsexpos]
911 snpsex = ll[snpsexpos]
912 stat = ll[statuspos]
913 id = '%s%s%s' % (fid,idjoiner,iid)
914 isexdict[id] = (pedsex,snpsex,stat,fest)
915 f.close()
916 else:
917 # this only happens if there are no subjects!
918 logf.write('No %s file - assuming no sex errors' % isexfile)
919 ##
920 ## FID IID O(HOM) E(HOM) N(NM) F
921 ## 457 2 490665 4.928e+05 722154 -0.009096
922 ## 457 3 464519 4.85e+05 710986 -0.0908
923 ## 1037 2 461632 4.856e+05 712025 -0.106
924 ## 1037 1 491845 4.906e+05 719353 0.005577
925 try:
926 f = open(ihetfile,'r')
927 except:
928 f = None
929 logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
930 if f:
931 for i,line in enumerate(f):
932 ll = line.strip().split()
933 if i == 0:
934 head = [x.upper() for x in ll] # expect above
935 fidpos = head.index('FID')
936 iidpos = head.index('IID')
937 fpos = head.index('F')
938 n = 0
939 elif len(ll) >= fpos: # full line
940 fid = ll[fidpos]
941 iid = ll[iidpos]
942 fhet = ll[fpos]
943 id = '%s%s%s' % (fid,idjoiner,iid)
944 ihetdict[id] = fhet
945 f.close() # now assemble and output result list
946 rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
947 res = []
948 fres = [] # floats for sorting
949 for id in idlist: # for each snp in found order
950 fid,iid = id.split(idjoiner) # recover keys
951 f_missing = imissdict.get(id,'0.0')
952 nmend = imenddict.get(id,'0')
953 (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
954 fhet = ihetdict.get(id,'0.0')
955 res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
956 try:
957 ff_missing = float(f_missing)
958 except:
959 ff_missing = 0.0
960 try:
961 inmend = int(nmend)
962 except:
963 inmend = 0
964 try:
965 ffest = float(fest)
966 except:
967 fest = 0.0
968 try:
969 ffhet = float(fhet)
970 except:
971 ffhet = 0.0
972 fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
973 ntokeep = max(20,len(res)/keepfrac)
974 for i,col in enumerate(Tsorts):
975 fres.sort(key=operator.itemgetter(col))
976 if Treverse[i]:
977 fres.reverse()
978 repname = Tnames[i]
979 Tops[repname] = fres[0:ntokeep]
980 Tops[repname] = [map(str,x) for x in Tops[repname]]
981 Tops[repname].insert(0,rhead)
982 res.sort()
983 res.insert(0,rhead)
984 logf.write('### writing %s report with %s' % ( outfile,res[0]))
985 f = open(outfile,'w')
986 f.write('\n'.join(['\t'.join(x) for x in res]))
987 f.write('\n')
988 f.close()
989 return res,Tops
990
991 def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
992 """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
993 keep a list of marker order but keep all stats in dicts
994 write out a fake xls file for R or SAS etc
995 kinda clunky, but..
996 TODO: ensure stable if any file not found?
997 """
998 mapdict = {}
999 if maplist <> None:
1000 rslist = [x[1] for x in maplist]
1001 offset = [(x[0],x[3]) for x in maplist]
1002 mapdict = dict(zip(rslist,offset))
1003 hwefile = '%s.hwe' % froot
1004 lmissfile = '%s.lmiss' % froot
1005 freqfile = '%s.frq' % froot
1006 lmendfile = '%s.lmendel' % froot
1007 outfile = os.path.join(newfpath,outfname)
1008 markerlist = []
1009 chromlist = []
1010 hwedict = {}
1011 lmissdict = {}
1012 freqdict = {}
1013 lmenddict = {}
1014 Tops = {}
1015 Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
1016 Tsorts = [3,6,10,11]
1017 Treverse = [False,True,True,True] # so first values are worse(r)
1018 #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
1019 #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
1020 # -------------------hwe--------------------------
1021 # hwe has SNP TEST GENO O(HET) E(HET) P_HWD
1022 # we want all hwe where P_HWD <> NA
1023 # ah changed in 1.04 to
1024 ## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P
1025 ## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644
1026 ## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA
1027 ## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644
1028 ## 1 rs4448553 ALL 2 3 8/176/748 0.1888 0.1848 0.5975
1029 ## 1 rs4448553 AFF 2 3 0/0/0 nan nan NA
1030 ## 1 rs4448553 UNAFF 2 3 8/176/748 0.1888 0.1848 0.5975
1031 ## 1 rs1990150 ALL 1 3 54/303/569 0.3272 0.3453 0.1067
1032 ## 1 rs1990150 AFF 1 3 0/0/0 nan nan NA
1033 ## 1 rs1990150 UNAFF 1 3 54/303/569 0.3272 0.3453 0.1067
1034 logf.write('## marker reports starting at %s\n' % timenow())
1035 try:
1036 f = open(hwefile,'r')
1037 except:
1038 f = None
1039 logf.write('## error - no hwefile %s found\n' % hwefile)
1040 if f:
1041 for i,line in enumerate(f):
1042 ll = line.strip().split()
1043 if i == 0: # head
1044 head = [x.upper() for x in ll] # expect above
1045 try:
1046 testpos = head.index('TEST')
1047 except:
1048 testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST')
1049 try:
1050 ppos = head.index('P')
1051 except:
1052 ppos = 8 # patch - for head.index('P')
1053 snppos = head.index('SNP')
1054 chrpos = head.index('CHR')
1055 logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos))
1056 elif len(ll) >= ppos: # full line
1057 ps = ll[ppos].upper()
1058 rs = ll[snppos]
1059 chrom = ll[chrpos]
1060 test = ll[testpos]
1061 if not hwedict.get(rs,None):
1062 hwedict[rs] = {}
1063 markerlist.append(rs)
1064 chromlist.append(chrom) # one place to find it?
1065 lpvals = 0
1066 if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping
1067 lpvals = '0'
1068 if ps <> '1':
1069 try:
1070 pval = float(ps)
1071 lpvals = '%f' % -math.log10(pval)
1072 except:
1073 pass
1074 hwedict[rs][test] = (ps,lpvals)
1075 else:
1076 logf.write('short line #%d = %s\n' % (i,ll))
1077 f.close()
1078 # ------------------missing--------------------
1079 """lmiss has
1080 CHR SNP N_MISS N_GENO F_MISS
1081 1 rs12354060 0 73 0
1082 1 rs4345758 1 73 0.0137
1083 1 rs2691310 73 73 1
1084 1 rs2531266 73 73 1
1085 # we want F_MISS"""
1086 try:
1087 f = open(lmissfile,'r')
1088 except:
1089 f = None
1090 if f:
1091 for i,line in enumerate(f):
1092 ll = line.strip().split()
1093 if i == 0:
1094 head = [x.upper() for x in ll] # expect above
1095 fracpos = head.index('F_MISS')
1096 npos = head.index('N_MISS')
1097 snppos = head.index('SNP')
1098 elif len(ll) >= fracpos: # full line
1099 rs = ll[snppos]
1100 fracval = ll[fracpos]
1101 lmissdict[rs] = fracval # for now, just that?
1102 else:
1103 lmissdict[rs] = 'NA'
1104 f.close()
1105 # ------------------freq-------------------
1106 # frq has CHR SNP A1 A2 MAF NM
1107 # we want maf
1108 try:
1109 f = open(freqfile,'r')
1110 except:
1111 f = None
1112 if f:
1113 for i,line in enumerate(f):
1114 ll = line.strip().split()
1115 if i == 0:
1116 head = [x.upper() for x in ll] # expect above
1117 mafpos = head.index('MAF')
1118 a1pos = head.index('A1')
1119 a2pos = head.index('A2')
1120 snppos = head.index('SNP')
1121 elif len(ll) >= mafpos: # full line
1122 rs = ll[snppos]
1123 a1 = ll[a1pos]
1124 a2 = ll[a2pos]
1125 maf = ll[mafpos]
1126 freqdict[rs] = (maf,a1,a2)
1127 f.close()
1128 # ------------------mend-------------------
1129 # lmend has CHR SNP N
1130 # we want N
1131 gotmend = True
1132 try:
1133 f = open(lmendfile,'r')
1134 except:
1135 gotmend = False
1136 for rs in markerlist:
1137 lmenddict[rs] = '0'
1138 if gotmend:
1139 for i,line in enumerate(f):
1140 ll = line.strip().split()
1141 if i == 0:
1142 head = [x.upper() for x in ll] # expect above
1143 npos = head.index('N')
1144 snppos = head.index('SNP')
1145 elif len(ll) >= npos: # full line
1146 rs = ll[snppos]
1147 nmend = ll[npos]
1148 lmenddict[rs] = nmend
1149 f.close()
1150 else:
1151 logf.write('No %s file - assuming not family data\n' % lmendfile)
1152 # now assemble result list
1153 rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
1154 res = []
1155 fres = []
1156 for rs in markerlist: # for each snp in found order
1157 f_missing = lmissdict.get(rs,'NA')
1158 maf,a1,a2 = freqdict.get(rs,('NA','NA','NA'))
1159 hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change...
1160 hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA'))
1161 nmend = lmenddict.get(rs,'NA')
1162 (chrom,offset)=mapdict.get(rs,('?','0'))
1163 res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
1164 ntokeep = max(10,len(res)/keepfrac)
1165
1166 def msortk(item=None):
1167 """
1168 deal with non numeric sorting
1169 """
1170 try:
1171 return float(item)
1172 except:
1173 return item
1174
1175 for i,col in enumerate(Tsorts):
1176 res.sort(key=msortk(lambda x:x[col]))
1177 if Treverse[i]:
1178 res.reverse()
1179 repname = Tnames[i]
1180 Tops[repname] = res[0:ntokeep]
1181 Tops[repname].insert(0,rhead)
1182 res.sort(key=lambda x: '%s_%10d' % (x[1].ljust(4,'0'),int(x[2]))) # in chrom offset order
1183 res.insert(0,rhead)
1184 f = open(outfile,'w')
1185 f.write('\n'.join(['\t'.join(x) for x in res]))
1186 f.close()
1187 return res,Tops
1188
1189
1190
1191
1192 def getfSize(fpath,outpath):
1193 """
1194 format a nice file size string
1195 """
1196 size = ''
1197 fp = os.path.join(outpath,fpath)
1198 if os.path.isfile(fp):
1199 n = float(os.path.getsize(fp))
1200 if n > 2**20:
1201 size = ' (%1.1f MB)' % (n/2**20)
1202 elif n > 2**10:
1203 size = ' (%1.1f KB)' % (n/2**10)
1204 elif n > 0:
1205 size = ' (%d B)' % (int(n))
1206 return size
1207
1208
1209 if __name__ == "__main__":
1210 u = """ called in xml as
1211 <command interpreter="python">
1212 rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix"
1213 -s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink'
1214 -r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R'
1215 </command>
1216
1217 Prepare a qc report - eg:
1218 print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog)
1219
1220 """
1221 progname = os.path.basename(sys.argv[0])
1222 if len(sys.argv) < 9:
1223 print '%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv)
1224 sys.exit(1)
1225 parser = OptionParser(usage=u, version="%prog 0.01")
1226 a = parser.add_option
1227 a("-i","--infile",dest="infile")
1228 a("-o","--oprefix",dest="opref")
1229 a("-l","--plinkexe",dest="plinke", default=plinke)
1230 a("-r","--rexe",dest="rexe", default=rexe)
1231 a("-s","--snps",dest="htmlf")
1232 #a("-m","--markerRaw",dest="markf")
1233 #a("-r","--rawsubject",dest="subjf")
1234 a("-p","--patho",dest="newfpath")
1235 (options,args) = parser.parse_args()
1236 basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below
1237 killme = string.punctuation + string.whitespace
1238 trantab = string.maketrans(killme,'_'*len(killme))
1239 opref = options.opref.translate(trantab)
1240 alogh,alog = tempfile.mkstemp(suffix='.txt')
1241 plogh,plog = tempfile.mkstemp(suffix='.txt')
1242 alogf = open(alog,'w')
1243 plogf = open(plog,'w')
1244 ahtmlf = options.htmlf
1245 amarkf = 'MarkerDetails_%s.xls' % opref
1246 asubjf = 'SubjectDetails_%s.xls' % opref
1247 newfpath = options.newfpath
1248 newfpath = os.path.realpath(newfpath)
1249 try:
1250 os.makedirs(newfpath)
1251 except:
1252 pass
1253 ofn = basename
1254 bfn = options.infile
1255 try:
1256 mapf = '%s.bim' % bfn
1257 maplist = file(mapf,'r').readlines()
1258 maplist = [x.split() for x in maplist]
1259 except:
1260 maplist = None
1261 alogf.write('## error - cannot open %s to read map - no offsets will be available for output files')
1262 #rerla@beast galaxy]$ head test-data/tinywga.bim
1263 #22 rs2283802 0 21784722 4 2
1264 #22 rs2267000 0 21785366 4 2
1265 rgbin = os.path.split(rexe)[0] # get our rg bin path
1266 #plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug!
1267 # if we could, do all at once? Nope. Probably never.
1268 plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'],
1269 ['--mendel',],['--check-sex',]]
1270 vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0']
1271 runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase)
1272 plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename],
1273 ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)],
1274 ['--bfile ldp_%s --het --out %s' % (basename,basename)]]
1275 # subset of ld independent markers for eigenstrat and other requirements
1276 vclbase = [options.plinke,'--noweb']
1277 plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase)
1278 plogf.write('\n'.join(plogout))
1279 plogf.write('\n')
1280 repout = os.path.join(newfpath,basename)
1281 subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath,
1282 logf=alogf) # writes the subject_froot.xls file
1283 markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,
1284 logf=alogf,maplist=maplist) # marker_froot.xls
1285 nbreaks = 100
1286 s = '## starting plotpage, newfpath=%s,m=%s,s=%s/n' % (newfpath,markers[:2],subjects[:2])
1287 alogf.write(s)
1288 print s
1289 plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,
1290 basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)
1291 #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)
1292
1293 # [titles[n],plotnames[n],outfnames[n] ]
1294 html = []
1295 html.append('<table cellpadding="5" border="0">')
1296 size = getfSize(amarkf,newfpath)
1297 html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
1298 (amarkf,'Click here to download the Marker QC Detail report file',size))
1299 size = getfSize(asubjf,newfpath)
1300 html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
1301 (asubjf,'Click here to download the Subject QC Detail report file',size))
1302 for (title,url,ofname) in plotpage:
1303 ttitle = 'Ranked %s' % title
1304 dat = subjectTops.get(ttitle,None)
1305 if not dat:
1306 dat = markerTops.get(ttitle,None)
1307 imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf
1308 thumbnail = os.path.join(newfpath,imghref)
1309 if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack
1310 imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg
1311 thumbnail = os.path.join(newfpath,imghref)
1312 if not os.path.exists(thumbnail):
1313 html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title))
1314 else:
1315 html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \
1316 % (url,imghref,title))
1317 if dat: # one or the other - write as an extra file and make a link here
1318 t = '%s.xls' % (ttitle.replace(' ','_'))
1319 fname = os.path.join(newfpath,t)
1320 f = file(fname,'w')
1321 f.write('\n'.join(['\t'.join(x) for x in dat])) # the report
1322 size = getfSize(t,newfpath)
1323 html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size))
1324 else:
1325 html.append('</a></td><td>%s</td><td>&nbsp;</td></tr>' % (title))
1326 html.append('</table><hr><h3>All output files from the QC run are available below</h3>')
1327 html.append('<table cellpadding="5" border="0">\n')
1328 flist = os.listdir(newfpath) # we want to catch 'em all
1329 flist.sort()
1330 for f in flist:
1331 fname = os.path.split(f)[-1]
1332 size = getfSize(fname,newfpath)
1333 html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size))
1334 html.append('</table>')
1335 alogf.close()
1336 plogf.close()
1337 llog = open(alog,'r').readlines()
1338 plogfile = open(plog,'r').readlines()
1339 os.unlink(alog)
1340 os.unlink(plog)
1341 llog += plogfile # add lines from pruneld log
1342 lf = file(ahtmlf,'w') # galaxy will show this as the default view
1343 lf.write(galhtmlprefix % progname)
1344 s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow())
1345 lf.write('<h4>%s</h4>\n' % s)
1346 lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n')
1347 lf.write('\n'.join(html))
1348 lf.write('<h4>QC run log contents</h4>')
1349 lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs
1350 if len(cruft) > 0:
1351 lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup
1352 lf.write('%s\n<hr>\n' % galhtmlpostfix)
1353 lf.close()
1354