Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgGLM.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/local/bin/python | |
| 2 """ | |
| 3 # added most of the available options for linear models | |
| 4 # june 2009 rml | |
| 5 # hack to run and process a plink quantitative trait | |
| 6 # | |
| 7 | |
| 8 This is a wrapper for Shaun Purcell's Plink linear/logistic models for | |
| 9 traits, covariates and genotypes. | |
| 10 | |
| 11 It requires some judgement to interpret the findings | |
| 12 We need some better visualizations - manhattan plots are good. | |
| 13 svg with rs numbers for top 1%? | |
| 14 | |
| 15 toptable tools - truncate a gg file down to some low percentile | |
| 16 | |
| 17 intersect with other tables - eg gene expression regressions on snps | |
| 18 | |
| 19 | |
| 20 | |
| 21 """ | |
| 22 | |
| 23 import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands | |
| 24 from rgutils import plinke | |
| 25 | |
| 26 def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): | |
| 27 """ | |
| 28 score must be scaled to 0-1000 | |
| 29 | |
| 30 Want to make some wig tracks from each analysis | |
| 31 Best n -log10(p). Make top hit the window. | |
| 32 we use our tab output which has | |
| 33 rs chrom offset ADD_stat ADD_p ADD_log10p | |
| 34 rs3094315 1 792429 1.151 0.2528 0.597223 | |
| 35 | |
| 36 """ | |
| 37 | |
| 38 def is_number(s): | |
| 39 try: | |
| 40 float(s) | |
| 41 return True | |
| 42 except ValueError: | |
| 43 return False | |
| 44 header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) | |
| 45 column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] | |
| 46 halfwidth=100 | |
| 47 resfpath = os.path.join(twd,resf) | |
| 48 resf = open(resfpath,'r') | |
| 49 resfl = resf.readlines() # dumb but convenient for millions of rows | |
| 50 resfl = [x.split() for x in resfl] | |
| 51 headl = resfl[0] | |
| 52 resfl = resfl[1:] | |
| 53 headl = [x.strip().upper() for x in headl] | |
| 54 headIndex = dict(zip(headl,range(0,len(headl)))) | |
| 55 chrpos = headIndex.get('CHROM',None) | |
| 56 rspos = headIndex.get('RS',None) | |
| 57 offspos = headIndex.get('OFFSET',None) | |
| 58 ppos = headIndex.get('ADD_LOG10P',None) | |
| 59 wewant = [chrpos,rspos,offspos,ppos] | |
| 60 if None in wewant: # missing something | |
| 61 logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) | |
| 62 return | |
| 63 resfl = [x for x in resfl if x[ppos] > ''] | |
| 64 resfl = [(float(x[ppos]),x) for x in resfl] # decorate | |
| 65 resfl.sort() | |
| 66 resfl.reverse() # using -log10 so larger is better | |
| 67 resfl = resfl[:topn] # truncate | |
| 68 pvals = [x[0] for x in resfl] # need to scale | |
| 69 resfl = [x[1] for x in resfl] # drop decoration | |
| 70 if len(pvals) == 0: | |
| 71 logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) | |
| 72 sys.exit(1) | |
| 73 maxp = max(pvals) # need to scale | |
| 74 minp = min(pvals) | |
| 75 prange = abs(maxp-minp) + 0.5 # fudge | |
| 76 scalefact = 1000.0/prange | |
| 77 logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) | |
| 78 for i,row in enumerate(resfl): | |
| 79 row[ppos] = '%d' % (int(scalefact*pvals[i])) | |
| 80 resfl[i] = row # replace | |
| 81 outf = file(outfname,'w') | |
| 82 outf.write(header) | |
| 83 outres = [] # need to resort into chrom offset order | |
| 84 for i,lrow in enumerate(resfl): | |
| 85 chrom,snp,offset,p, = [lrow[x] for x in wewant] | |
| 86 gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), | |
| 87 '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) | |
| 88 outres.append(gff) | |
| 89 outres = [(x[0],int(x[3]),x) for x in outres] # decorate | |
| 90 outres.sort() # into chrom offset | |
| 91 outres=[x[2] for x in outres] # undecorate | |
| 92 outres = ['\t'.join(x) for x in outres] | |
| 93 outf.write('\n'.join(outres)) | |
| 94 outf.write('\n') | |
| 95 outf.close() | |
| 96 | |
| 97 | |
| 98 | |
| 99 def xformQassoc(resf='',outfname='',logf=None,twd='.'): | |
| 100 """ plink.assoc.linear to gg file | |
| 101 from the docs | |
| 102 The output per each SNP might look something like: | |
| 103 | |
| 104 CHR SNP BP A1 TEST NMISS OR STAT P | |
| 105 5 rs000001 10001 A ADD 664 0.7806 -1.942 0.05216 | |
| 106 5 rs000001 10001 A DOMDEV 664 0.9395 -0.3562 0.7217 | |
| 107 5 rs000001 10001 A COV1 664 0.9723 -0.7894 0.4299 | |
| 108 5 rs000001 10001 A COV2 664 1.159 0.5132 0.6078 | |
| 109 5 rs000001 10001 A GENO_2DF 664 NA 5.059 0.0797 | |
| 110 need to transform into gg columns for each distinct test | |
| 111 or bed for tracks? | |
| 112 | |
| 113 """ | |
| 114 logf.write('xformQassoc got resf=%s, outfname=%s\n' % (resf,outfname)) | |
| 115 resdict = {} | |
| 116 rsdict = {} | |
| 117 markerlist = [] | |
| 118 # plink is "clever" - will run logistic if only 2 categories such as gender | |
| 119 resfs = resf.split('.') | |
| 120 if resfs[-1] == 'logistic': | |
| 121 resfs[-1] = 'linear' | |
| 122 else: | |
| 123 resfs[-1] = 'logistic' | |
| 124 altresf = '.'.join(resfs) | |
| 125 | |
| 126 altresfpath = os.path.join(twd,altresf) | |
| 127 resfpath = os.path.join(twd,resf) | |
| 128 try: | |
| 129 resf = open(resfpath,'r') | |
| 130 except: | |
| 131 try: | |
| 132 resf = open(altresfpath,'r') | |
| 133 except: | |
| 134 print >> sys.stderr, '## error - no file plink output %s or %s found - cannot continue' % (resfpath, altresfpath) | |
| 135 sys.exit(1) | |
| 136 for lnum,row in enumerate(resf): | |
| 137 if lnum == 0: | |
| 138 headl = row.split() | |
| 139 headl = [x.strip().upper() for x in headl] | |
| 140 headIndex = dict(zip(headl,range(0,len(headl)))) | |
| 141 chrpos = headIndex.get('CHR',None) | |
| 142 rspos = headIndex.get('SNP',None) | |
| 143 offspos = headIndex.get('BP',None) | |
| 144 nmisspos = headIndex.get('NMISS',None) | |
| 145 testpos = headIndex.get('TEST',None) | |
| 146 ppos = headIndex.get('P',None) | |
| 147 coeffpos = headIndex.get('OR',None) | |
| 148 if not coeffpos: | |
| 149 coeffpos = headIndex.get('BETA',None) | |
| 150 apos = headIndex.get('A1',None) | |
| 151 statpos = headIndex.get('STAT',None) | |
| 152 wewant = [chrpos,rspos,offspos,testpos,statpos,ppos,coeffpos,apos] | |
| 153 if None in wewant: # missing something | |
| 154 logf.write('missing a required header in xformQassoc - headIndex=%s\n' % headIndex) | |
| 155 return | |
| 156 llen = len(headl) | |
| 157 else: # no Nones! | |
| 158 ll = row.split() | |
| 159 if len(ll) >= llen: # valid line | |
| 160 chrom,snp,offset,test,stat,p,coeff,allele = [ll[x] for x in wewant] | |
| 161 snp = snp.strip() | |
| 162 if p <> 'NA' : | |
| 163 try: | |
| 164 ffp = float(p) | |
| 165 if ffp <> 0: | |
| 166 lp = -math.log10(ffp) | |
| 167 except: | |
| 168 lp = 0.0 | |
| 169 resdict.setdefault(test,{}) | |
| 170 resdict[test][snp] = (stat,p,'%f' % lp) | |
| 171 if rsdict.get(snp,None) == None: | |
| 172 rsdict[snp] = (chrom,offset) | |
| 173 markerlist.append(snp) | |
| 174 # now have various tests indexed by rs | |
| 175 tk = resdict.keys() | |
| 176 tk.sort() # tests | |
| 177 ohead = ['rs','chrom','offset'] | |
| 178 for t in tk: # add headers | |
| 179 ohead.append('%s_stat' % t) | |
| 180 ohead.append('%s_p' % t) | |
| 181 ohead.append('%s_log10p' % t) | |
| 182 oheads = '\t'.join(ohead) | |
| 183 res = [oheads,] | |
| 184 for snp in markerlist: # retain original order | |
| 185 chrom,offset = rsdict[snp] | |
| 186 outl = [snp,chrom,offset] | |
| 187 for t in tk: | |
| 188 outl += resdict[t][snp] # add stat,p for this test | |
| 189 outs = '\t'.join(outl) | |
| 190 res.append(outs) | |
| 191 f = file(outfname,'w') | |
| 192 res.append('') | |
| 193 f.write('\n'.join(res)) | |
| 194 f.close() | |
| 195 | |
| 196 | |
| 197 if __name__ == "__main__": | |
| 198 """ | |
| 199 | |
| 200 <command interpreter="python"> | |
| 201 rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name' | |
| 202 "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' | |
| 203 '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$wigout' | |
| 204 </command> | |
| 205 """ | |
| 206 topn = 1000 | |
| 207 killme = string.punctuation+string.whitespace | |
| 208 trantab = string.maketrans(killme,'_'*len(killme)) | |
| 209 if len(sys.argv) < 17: | |
| 210 s = 'rgGLM.py needs 17 params - got %s \n' % (sys.argv) | |
| 211 sys.stderr.write(s) # print >>,s would probably also work? | |
| 212 sys.exit(0) | |
| 213 blurb = 'rgGLM.py called with %s' % sys.argv | |
| 214 print >> sys.stdout,blurb | |
| 215 bfname = sys.argv[1] | |
| 216 phename = sys.argv[2] | |
| 217 title = sys.argv[3] | |
| 218 title.translate(trantab) | |
| 219 predvar = sys.argv[4] | |
| 220 covar = sys.argv[5].strip() | |
| 221 outfname = sys.argv[6] | |
| 222 logfname = sys.argv[7] | |
| 223 op = os.path.split(logfname)[0] | |
| 224 try: # for test - needs this done | |
| 225 os.makedirs(op) | |
| 226 except: | |
| 227 pass | |
| 228 basename = sys.argv[8].translate(trantab) | |
| 229 inter = sys.argv[9] == '1' | |
| 230 cond = sys.argv[10].strip() | |
| 231 if cond == 'None': | |
| 232 cond = '' | |
| 233 gender = sys.argv[11] == '1' | |
| 234 mind = sys.argv[12] | |
| 235 geno = sys.argv[13] | |
| 236 maf = sys.argv[14] | |
| 237 logistic = sys.argv[15].strip()=='1' | |
| 238 gffout = sys.argv[16] | |
| 239 me = sys.argv[0] | |
| 240 phepath = '%s.pphe' % phename | |
| 241 twd = tempfile.mkdtemp(suffix='rgGLM') # make sure plink doesn't spew log file into the root! | |
| 242 tplog = os.path.join(twd,'%s.log' % basename) # should be path to plink log | |
| 243 vcl = [plinke,'--noweb','--bfile',bfname,'--pheno-name','"%s"' % predvar,'--pheno', | |
| 244 phepath,'--out',basename,'--mind %s' % mind, '--geno %s' % geno, | |
| 245 '--maf %s' % maf] | |
| 246 if logistic: | |
| 247 vcl.append('--logistic') | |
| 248 resf = '%s.assoc.logistic' % basename # plink output is here we hope | |
| 249 else: | |
| 250 vcl.append('--linear') | |
| 251 resf = '%s.assoc.linear' % basename # plink output is here we hope | |
| 252 resf = os.path.join(twd,resf) | |
| 253 if gender: | |
| 254 vcl.append('--sex') | |
| 255 if inter: | |
| 256 vcl.append('--interaction') | |
| 257 if covar > 'None': | |
| 258 vcl += ['--covar',phepath,'--covar-name',covar] # comma sep list of covariates | |
| 259 tcfile = None | |
| 260 if len(cond) > 0: # plink wants these in a file.. | |
| 261 dummy,tcfile = tempfile.mkstemp(suffix='condlist') # | |
| 262 f = open(tcfile,'w') | |
| 263 cl = cond.split() | |
| 264 f.write('\n'.join(cl)) | |
| 265 f.write('\n') | |
| 266 f.close() | |
| 267 vcl.append('--condition-list %s' % tcfile) | |
| 268 p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd) | |
| 269 retval = p.wait() | |
| 270 if tcfile: | |
| 271 os.unlink(tcfile) | |
| 272 plinklog = file(tplog,'r').read() | |
| 273 logf = file(logfname,'w') | |
| 274 logf.write(blurb) | |
| 275 logf.write('\n') | |
| 276 logf.write('vcl=%s\n' % vcl) | |
| 277 xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file | |
| 278 makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name='rgGLM_TopTable',description=title,topn=topn) | |
| 279 logf.write('\n') | |
| 280 logf.write(plinklog) | |
| 281 logf.close() | |
| 282 #shutil.rmtree(twd) # clean up | |
| 283 | |
| 284 | |
| 285 | |
| 286 | |
| 287 |
