Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgCaCo.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/local/bin/python | |
| 2 # hack to run and process a plink case control association | |
| 3 # expects args as | |
| 4 # bfilepath outname jobname outformat (wig,xls) | |
| 5 # ross lazarus | |
| 6 # for wig files, we need annotation so look for map file or complain | |
| 7 """ | |
| 8 Parameters for wiggle track definition lines | |
| 9 All options are placed in a single line separated by spaces: | |
| 10 | |
| 11 track type=wiggle_0 name=track_label description=center_label \ | |
| 12 visibility=display_mode color=r,g,b altColor=r,g,b \ | |
| 13 priority=priority autoScale=on|off \ | |
| 14 gridDefault=on|off maxHeightPixels=max:default:min \ | |
| 15 graphType=bar|points viewLimits=lower:upper \ | |
| 16 yLineMark=real-value yLineOnOff=on|off \ | |
| 17 windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 | |
| 18 """ | |
| 19 | |
| 20 import sys,math,shutil,subprocess,os,time,tempfile,string | |
| 21 from os.path import abspath | |
| 22 from rgutils import timenow, plinke | |
| 23 imagedir = '/static/rg' # if needed for images | |
| 24 myversion = 'V000.1 April 2007' | |
| 25 verbose = False | |
| 26 | |
| 27 def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): | |
| 28 """ | |
| 29 score must be scaled to 0-1000 | |
| 30 | |
| 31 Want to make some wig tracks from each analysis | |
| 32 Best n -log10(p). Make top hit the window. | |
| 33 we use our tab output which has | |
| 34 rs chrom offset ADD_stat ADD_p ADD_log10p | |
| 35 rs3094315 1 792429 1.151 0.2528 0.597223 | |
| 36 | |
| 37 """ | |
| 38 | |
| 39 def is_number(s): | |
| 40 try: | |
| 41 float(s) | |
| 42 return True | |
| 43 except ValueError: | |
| 44 return False | |
| 45 header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) | |
| 46 column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] | |
| 47 halfwidth=100 | |
| 48 resfpath = os.path.join(twd,resf) | |
| 49 resf = open(resfpath,'r') | |
| 50 resfl = resf.readlines() # dumb but convenient for millions of rows | |
| 51 resfl = [x.split() for x in resfl] | |
| 52 headl = resfl[0] | |
| 53 resfl = resfl[1:] | |
| 54 headl = [x.strip().upper() for x in headl] | |
| 55 headIndex = dict(zip(headl,range(0,len(headl)))) | |
| 56 whatwewant = ['CHR','RS','OFFSET','LOG10ARMITAGEP'] | |
| 57 wewant = [headIndex.get(x,None) for x in whatwewant] | |
| 58 if None in wewant: # missing something | |
| 59 logf.write('### Error missing a required header from %s in makeGFF - headIndex=%s\n' % (whatwewant,headIndex)) | |
| 60 return | |
| 61 ppos = wewant[3] # last in list | |
| 62 resfl = [x for x in resfl if x[ppos] > '' and x[ppos] <> 'NA'] | |
| 63 resfl = [(float(x[ppos]),x) for x in resfl] # decorate | |
| 64 resfl.sort() | |
| 65 resfl.reverse() # using -log10 so larger is better | |
| 66 pvals = [x[0] for x in resfl] # need to scale | |
| 67 resfl = [x[1] for x in resfl] # drop decoration | |
| 68 resfl = resfl[:topn] # truncate | |
| 69 maxp = max(pvals) # need to scale | |
| 70 minp = min(pvals) | |
| 71 prange = abs(maxp-minp) + 0.5 # fudge | |
| 72 scalefact = 1000.0/prange | |
| 73 logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) | |
| 74 for i,row in enumerate(resfl): | |
| 75 row[ppos] = '%d' % (int(scalefact*pvals[i])) | |
| 76 resfl[i] = row # replace | |
| 77 outf = file(outfname,'w') | |
| 78 outf.write(header) | |
| 79 outres = [] # need to resort into chrom offset order | |
| 80 for i,lrow in enumerate(resfl): | |
| 81 chrom,snp,offset,p, = [lrow[x] for x in wewant] | |
| 82 gff = ('chr%s' % chrom,'rgCaCo','variation','%d' % (int(offset)-halfwidth), | |
| 83 '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) | |
| 84 outres.append(gff) | |
| 85 outres = [(x[0],int(x[3]),x) for x in outres] # decorate | |
| 86 outres.sort() # into chrom offset | |
| 87 outres=[x[2] for x in outres] # undecorate | |
| 88 outres = ['\t'.join(x) for x in outres] | |
| 89 outf.write('\n'.join(outres)) | |
| 90 outf.write('\n') | |
| 91 outf.close() | |
| 92 | |
| 93 | |
| 94 def plink_assocToGG(plinkout="hm",tag='test'): | |
| 95 """ plink --assoc output looks like this | |
| 96 # CHR SNP A1 F_A F_U A2 CHISQ P OR | |
| 97 # 1 rs3094315 G 0.6685 0.1364 A 104.1 1.929e-24 12.77 | |
| 98 # write as a genegraph input file | |
| 99 """ | |
| 100 inf = file('%s.assoc' % plinkout,'r') | |
| 101 outf = file('%sassoc.xls' % plinkout,'w') | |
| 102 res = ['rs\tlog10p%s\tFakeInvOR%s\tRealOR%s' % (tag,tag,tag),] # output header for ucsc genome graphs | |
| 103 head = inf.next() | |
| 104 for l in inf: | |
| 105 ll = l.split() | |
| 106 if len(ll) >= 8: | |
| 107 p = float(ll[7]) | |
| 108 if p <> 'NA': # eesh | |
| 109 logp = '%9.9f' % -math.log10(p) | |
| 110 else: | |
| 111 logp = 'NA' | |
| 112 try: | |
| 113 orat = ll[8] | |
| 114 except: | |
| 115 orat = 'NA' | |
| 116 orat2 = orat | |
| 117 # invert large negative odds ratios | |
| 118 if float(orat) < 1 and float(orat) > 0.0: | |
| 119 orat2 = '%9.9f' % (1.0/float(orat)) | |
| 120 outl = [ll[1],logp, orat2, orat] | |
| 121 res.append('\t'.join(outl)) | |
| 122 outf.write('\n'.join(res)) | |
| 123 outf.write('\n') | |
| 124 outf.close() | |
| 125 inf.close() | |
| 126 | |
| 127 def xformModel(infname='',resf='',outfname='', | |
| 128 name='foo',mapf='/usr/local/galaxy/data/rg/ped/x.bim',flog=None): | |
| 129 """munge a plink .model file into either a ucsc track or an xls file | |
| 130 rerla@meme ~/plink]$ head hmYRI_CEU.model | |
| 131 CHR SNP TEST AFF UNAFF CHISQ DF P | |
| 132 1 rs3094315 GENO 41/37/11 0/24/64 NA NA NA | |
| 133 1 rs3094315 TREND 119/59 24/152 81.05 1 2.201e-19 | |
| 134 1 rs3094315 ALLELIC 119/59 24/152 104.1 1 1.929e-24 | |
| 135 1 rs3094315 DOM 78/11 24/64 NA NA NA | |
| 136 | |
| 137 bim file has | |
| 138 [rerla@beast pbed]$ head plink_wgas1_example.bim | |
| 139 1 rs3094315 0.792429 792429 G A | |
| 140 1 rs6672353 0.817376 817376 A G | |
| 141 """ | |
| 142 if verbose: | |
| 143 print 'Rgenetics rgCaCo.xformModel got resf=%s, outfname=%s' % (resf,outfname) | |
| 144 res = [] | |
| 145 rsdict = {} | |
| 146 map = file(mapf,'r') | |
| 147 for l in map: # plink map | |
| 148 ll = l.strip().split() | |
| 149 if len(ll) >= 3: | |
| 150 rs=ll[1].strip() | |
| 151 chrom = ll[0] | |
| 152 if chrom.lower() == 'x': | |
| 153 chrom='23' | |
| 154 elif chrom.lower() == 'y': | |
| 155 chrom = 24 | |
| 156 elif chrom.lower() == 'mito': | |
| 157 chrom = 25 | |
| 158 offset = ll[3] | |
| 159 rsdict[rs] = (chrom,offset) | |
| 160 res.append('rs\tChr\tOffset\tGenop\tlog10Genop\tArmitagep\tlog10Armitagep\tAllelep\tlog10Allelep\tDomp\tlog10Domp') | |
| 161 f = open(resf,'r') | |
| 162 headl = f.readline() | |
| 163 if headl.find('\t') <> -1: | |
| 164 headl = headl.split('\t') | |
| 165 delim = '\t' | |
| 166 else: | |
| 167 headl = headl.split() | |
| 168 delim = None | |
| 169 whatwewant = ['CHR','SNP','TEST','AFF','UNAFF','CHISQ','P'] | |
| 170 wewant = [headl.index(x) for x in whatwewant] | |
| 171 llen = len(headl) | |
| 172 lnum = anum = 0 | |
| 173 lastsnp = None # so we know when to write out a gg line | |
| 174 outl = {} | |
| 175 f.seek(0) | |
| 176 for lnum,l in enumerate(f): | |
| 177 if lnum == 0: | |
| 178 continue | |
| 179 ll = l.split() | |
| 180 if delim: | |
| 181 ll = l.split(delim) | |
| 182 if len(ll) >= llen: # valid line | |
| 183 chr,snp,test,naff,nuaff,chi,p = [ll[x] for x in wewant] | |
| 184 snp = snp.strip() | |
| 185 chrom,offset = rsdict.get(snp,(None,None)) | |
| 186 anum += 1 | |
| 187 fp = 1.0 # if NA | |
| 188 lp = 0.0 | |
| 189 try: | |
| 190 fp = float(p) | |
| 191 if fp > 0: | |
| 192 lp = -math.log10(fp) | |
| 193 else: | |
| 194 fp = 9e-100 | |
| 195 flog.write('### WARNING - Plink calculated %s for %s p value!!! 9e-100 substituted!\n' % (p,test)) | |
| 196 flog.write('### offending line #%d in %s = %s' % (lnum,l)) | |
| 197 except: | |
| 198 pass | |
| 199 if snp <> lastsnp: | |
| 200 if len(outl.keys()) > 3: | |
| 201 sl = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')] | |
| 202 res.append('\t'.join(sl)) # last snp line | |
| 203 outl = {'snp':snp,'chrom':chrom,'offset':offset} # first 3 cols for gg line | |
| 204 lastsnp = snp # reset for next marker | |
| 205 #if p == 'NA': | |
| 206 # p = 1.0 | |
| 207 # let's pass downstream for handling R is fine? | |
| 208 outl[test] = '%s\t%f' % (p,lp) | |
| 209 if len(outl.keys()) > 3: | |
| 210 l = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')] | |
| 211 res.append('\t'.join(l)) # last snp line | |
| 212 f = file(outfname,'w') | |
| 213 res.append('') | |
| 214 f.write('\n'.join(res)) | |
| 215 f.close() | |
| 216 | |
| 217 | |
| 218 | |
| 219 | |
| 220 if __name__ == "__main__": | |
| 221 """ | |
| 222 # called as | |
| 223 <command interpreter="python"> | |
| 224 rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name" | |
| 225 '$out_file1' '$logf' '$logf.files_path' '$gffout' | |
| 226 </command> </command> | |
| 227 """ | |
| 228 if len(sys.argv) < 7: | |
| 229 s = 'rgCaCo.py needs 6 params - got %s \n' % (sys.argv) | |
| 230 print >> sys.stdout, s | |
| 231 sys.exit(0) | |
| 232 bfname = sys.argv[1] | |
| 233 name = sys.argv[2] | |
| 234 killme = string.punctuation + string.whitespace | |
| 235 trantab = string.maketrans(killme,'_'*len(killme)) | |
| 236 name = name.translate(trantab) | |
| 237 outfname = sys.argv[3] | |
| 238 logf = sys.argv[4] | |
| 239 logoutdir = sys.argv[5] | |
| 240 gffout = sys.argv[6] | |
| 241 topn = 1000 | |
| 242 try: | |
| 243 os.makedirs(logoutdir) | |
| 244 except: | |
| 245 pass | |
| 246 map_file = None | |
| 247 me = sys.argv[0] | |
| 248 amapf = '%s.bim' % bfname # to decode map in xformModel | |
| 249 flog = file(logf,'w') | |
| 250 logme = [] | |
| 251 cdir = os.getcwd() | |
| 252 s = 'Rgenetics %s http://rgenetics.org Galaxy Tools, rgCaCo.py started %s\n' % (myversion,timenow()) | |
| 253 print >> sys.stdout, s # so will appear as blurb for file | |
| 254 logme.append(s) | |
| 255 if verbose: | |
| 256 s = 'rgCaCo.py: bfname=%s, logf=%s, argv = %s\n' % (bfname, logf, sys.argv) | |
| 257 print >> sys.stdout, s # so will appear as blurb for file | |
| 258 logme.append(s) | |
| 259 twd = tempfile.mkdtemp(suffix='rgCaCo') # make sure plink doesn't spew log file into the root! | |
| 260 tname = os.path.join(twd,name) | |
| 261 vcl = [plinke,'--noweb','--bfile',bfname,'--out',name,'--model'] | |
| 262 p=subprocess.Popen(' '.join(vcl),shell=True,stdout=flog,cwd=twd) | |
| 263 retval = p.wait() | |
| 264 resf = '%s.model' % tname # plink output is here we hope | |
| 265 xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file | |
| 266 makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgCaCo_TopTable',description=name,topn=topn) | |
| 267 flog.write('\n'.join(logme)) | |
| 268 flog.close() # close the log used | |
| 269 #shutil.copytree(twd,logoutdir) | |
| 270 shutil.rmtree(twd) # clean up | |
| 271 |
