| 0 | 1 #!/usr/local/bin/python | 
|  | 2 # hack to run and process a plink case control association | 
|  | 3 # expects args as | 
|  | 4 # bfilepath outname jobname outformat (wig,xls) | 
|  | 5 # ross lazarus | 
|  | 6 # for wig files, we need annotation so look for map file or complain | 
|  | 7 """ | 
|  | 8 Parameters for wiggle track definition lines | 
|  | 9 All options are placed in a single line separated by spaces: | 
|  | 10 | 
|  | 11   track type=wiggle_0 name=track_label description=center_label \ | 
|  | 12         visibility=display_mode color=r,g,b altColor=r,g,b \ | 
|  | 13         priority=priority autoScale=on|off \ | 
|  | 14         gridDefault=on|off maxHeightPixels=max:default:min \ | 
|  | 15         graphType=bar|points viewLimits=lower:upper \ | 
|  | 16         yLineMark=real-value yLineOnOff=on|off \ | 
|  | 17         windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 | 
|  | 18 """ | 
|  | 19 | 
|  | 20 import sys,math,shutil,subprocess,os,time,tempfile,string | 
|  | 21 from os.path import abspath | 
|  | 22 from rgutils import timenow, plinke | 
|  | 23 imagedir = '/static/rg' # if needed for images | 
|  | 24 myversion = 'V000.1 April 2007' | 
|  | 25 verbose = False | 
|  | 26 | 
|  | 27 def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): | 
|  | 28     """ | 
|  | 29     score must be scaled to 0-1000 | 
|  | 30 | 
|  | 31     Want to make some wig tracks from each analysis | 
|  | 32     Best n -log10(p). Make top hit the window. | 
|  | 33     we use our tab output which has | 
|  | 34     rs	chrom	offset	ADD_stat	ADD_p	ADD_log10p | 
|  | 35     rs3094315	1	792429	1.151	0.2528	0.597223 | 
|  | 36 | 
|  | 37     """ | 
|  | 38 | 
|  | 39     def is_number(s): | 
|  | 40         try: | 
|  | 41             float(s) | 
|  | 42             return True | 
|  | 43         except ValueError: | 
|  | 44             return False | 
|  | 45     header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) | 
|  | 46     column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] | 
|  | 47     halfwidth=100 | 
|  | 48     resfpath = os.path.join(twd,resf) | 
|  | 49     resf = open(resfpath,'r') | 
|  | 50     resfl = resf.readlines() # dumb but convenient for millions of rows | 
|  | 51     resfl = [x.split() for x in resfl] | 
|  | 52     headl = resfl[0] | 
|  | 53     resfl = resfl[1:] | 
|  | 54     headl = [x.strip().upper() for x in headl] | 
|  | 55     headIndex = dict(zip(headl,range(0,len(headl)))) | 
|  | 56     whatwewant = ['CHR','RS','OFFSET','LOG10ARMITAGEP'] | 
|  | 57     wewant = [headIndex.get(x,None) for x in whatwewant] | 
|  | 58     if None in wewant: # missing something | 
|  | 59        logf.write('### Error missing a required header from %s in makeGFF - headIndex=%s\n' % (whatwewant,headIndex)) | 
|  | 60        return | 
|  | 61     ppos = wewant[3] # last in list | 
|  | 62     resfl = [x for x in resfl if x[ppos] > '' and x[ppos] <> 'NA'] | 
|  | 63     resfl = [(float(x[ppos]),x) for x in resfl] # decorate | 
|  | 64     resfl.sort() | 
|  | 65     resfl.reverse() # using -log10 so larger is better | 
|  | 66     pvals = [x[0] for x in resfl] # need to scale | 
|  | 67     resfl = [x[1] for x in resfl] # drop decoration | 
|  | 68     resfl = resfl[:topn] # truncate | 
|  | 69     maxp = max(pvals) # need to scale | 
|  | 70     minp = min(pvals) | 
|  | 71     prange = abs(maxp-minp) + 0.5 # fudge | 
|  | 72     scalefact = 1000.0/prange | 
|  | 73     logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) | 
|  | 74     for i,row in enumerate(resfl): | 
|  | 75         row[ppos] = '%d' % (int(scalefact*pvals[i])) | 
|  | 76         resfl[i] = row # replace | 
|  | 77     outf = file(outfname,'w') | 
|  | 78     outf.write(header) | 
|  | 79     outres = [] # need to resort into chrom offset order | 
|  | 80     for i,lrow in enumerate(resfl): | 
|  | 81         chrom,snp,offset,p, = [lrow[x] for x in wewant] | 
|  | 82         gff = ('chr%s' % chrom,'rgCaCo','variation','%d' % (int(offset)-halfwidth), | 
|  | 83                '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) | 
|  | 84         outres.append(gff) | 
|  | 85     outres = [(x[0],int(x[3]),x) for x in outres] # decorate | 
|  | 86     outres.sort() # into chrom offset | 
|  | 87     outres=[x[2] for x in outres] # undecorate | 
|  | 88     outres = ['\t'.join(x) for x in outres] | 
|  | 89     outf.write('\n'.join(outres)) | 
|  | 90     outf.write('\n') | 
|  | 91     outf.close() | 
|  | 92 | 
|  | 93 | 
|  | 94 def plink_assocToGG(plinkout="hm",tag='test'): | 
|  | 95    """ plink --assoc output looks like this | 
|  | 96    #  CHR         SNP   A1      F_A      F_U   A2        CHISQ            P           OR | 
|  | 97    #   1   rs3094315    G   0.6685   0.1364    A        104.1    1.929e-24        12.77 | 
|  | 98    # write as a genegraph input file | 
|  | 99    """ | 
|  | 100    inf = file('%s.assoc' % plinkout,'r') | 
|  | 101    outf = file('%sassoc.xls' % plinkout,'w') | 
|  | 102    res = ['rs\tlog10p%s\tFakeInvOR%s\tRealOR%s' % (tag,tag,tag),] # output header for ucsc genome graphs | 
|  | 103    head = inf.next() | 
|  | 104    for l in inf: | 
|  | 105     ll = l.split() | 
|  | 106     if len(ll) >= 8: | 
|  | 107       p = float(ll[7]) | 
|  | 108       if p <> 'NA': # eesh | 
|  | 109           logp = '%9.9f' % -math.log10(p) | 
|  | 110       else: | 
|  | 111           logp = 'NA' | 
|  | 112       try: | 
|  | 113          orat = ll[8] | 
|  | 114       except: | 
|  | 115          orat = 'NA' | 
|  | 116       orat2 = orat | 
|  | 117       # invert large negative odds ratios | 
|  | 118       if float(orat) < 1 and float(orat) > 0.0: | 
|  | 119          orat2 = '%9.9f' % (1.0/float(orat)) | 
|  | 120       outl = [ll[1],logp, orat2, orat] | 
|  | 121       res.append('\t'.join(outl)) | 
|  | 122    outf.write('\n'.join(res)) | 
|  | 123    outf.write('\n') | 
|  | 124    outf.close() | 
|  | 125    inf.close() | 
|  | 126 | 
|  | 127 def xformModel(infname='',resf='',outfname='', | 
|  | 128                name='foo',mapf='/usr/local/galaxy/data/rg/ped/x.bim',flog=None): | 
|  | 129     """munge a plink .model file into either a ucsc track or an xls file | 
|  | 130     rerla@meme ~/plink]$ head hmYRI_CEU.model | 
|  | 131     CHR         SNP     TEST            AFF          UNAFF        CHISQ   DF            P | 
|  | 132     1   rs3094315     GENO       41/37/11        0/24/64           NA   NA           NA | 
|  | 133     1   rs3094315    TREND         119/59         24/152        81.05    1    2.201e-19 | 
|  | 134     1   rs3094315  ALLELIC         119/59         24/152        104.1    1    1.929e-24 | 
|  | 135     1   rs3094315      DOM          78/11          24/64           NA   NA           NA | 
|  | 136 | 
|  | 137     bim file has | 
|  | 138 [rerla@beast pbed]$ head plink_wgas1_example.bim | 
|  | 139 1	rs3094315	0.792429	792429	G	A | 
|  | 140 1	rs6672353	0.817376	817376	A	G | 
|  | 141     """ | 
|  | 142     if verbose: | 
|  | 143         print 'Rgenetics rgCaCo.xformModel got resf=%s,  outfname=%s' % (resf,outfname) | 
|  | 144     res = [] | 
|  | 145     rsdict = {} | 
|  | 146     map = file(mapf,'r') | 
|  | 147     for l in map: # plink map | 
|  | 148         ll = l.strip().split() | 
|  | 149         if len(ll) >= 3: | 
|  | 150             rs=ll[1].strip() | 
|  | 151             chrom = ll[0] | 
|  | 152             if chrom.lower() == 'x': | 
|  | 153                 chrom='23' | 
|  | 154             elif chrom.lower() == 'y': | 
|  | 155                 chrom = 24 | 
|  | 156             elif chrom.lower() == 'mito': | 
|  | 157                 chrom = 25 | 
|  | 158             offset = ll[3] | 
|  | 159             rsdict[rs] = (chrom,offset) | 
|  | 160     res.append('rs\tChr\tOffset\tGenop\tlog10Genop\tArmitagep\tlog10Armitagep\tAllelep\tlog10Allelep\tDomp\tlog10Domp') | 
|  | 161     f = open(resf,'r') | 
|  | 162     headl = f.readline() | 
|  | 163     if headl.find('\t') <> -1: | 
|  | 164        headl = headl.split('\t') | 
|  | 165        delim = '\t' | 
|  | 166     else: | 
|  | 167        headl = headl.split() | 
|  | 168        delim = None | 
|  | 169     whatwewant = ['CHR','SNP','TEST','AFF','UNAFF','CHISQ','P'] | 
|  | 170     wewant = [headl.index(x) for x in whatwewant] | 
|  | 171     llen = len(headl) | 
|  | 172     lnum = anum = 0 | 
|  | 173     lastsnp = None # so we know when to write out a gg line | 
|  | 174     outl = {} | 
|  | 175     f.seek(0) | 
|  | 176     for lnum,l in enumerate(f): | 
|  | 177         if lnum == 0: | 
|  | 178             continue | 
|  | 179         ll = l.split() | 
|  | 180         if delim: | 
|  | 181            ll = l.split(delim) | 
|  | 182         if len(ll) >= llen: # valid line | 
|  | 183             chr,snp,test,naff,nuaff,chi,p = [ll[x] for x in wewant] | 
|  | 184             snp = snp.strip() | 
|  | 185             chrom,offset = rsdict.get(snp,(None,None)) | 
|  | 186             anum += 1 | 
|  | 187             fp = 1.0 # if NA | 
|  | 188             lp = 0.0 | 
|  | 189             try: | 
|  | 190                 fp = float(p) | 
|  | 191                 if fp > 0: | 
|  | 192                   lp = -math.log10(fp) | 
|  | 193                 else: | 
|  | 194                     fp = 9e-100 | 
|  | 195                     flog.write('### WARNING - Plink calculated %s for %s p value!!! 9e-100 substituted!\n' % (p,test)) | 
|  | 196                     flog.write('### offending line #%d in %s = %s' % (lnum,l)) | 
|  | 197             except: | 
|  | 198                 pass | 
|  | 199             if snp <> lastsnp: | 
|  | 200                 if len(outl.keys()) > 3: | 
|  | 201                     sl = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')] | 
|  | 202                     res.append('\t'.join(sl)) # last snp line | 
|  | 203                 outl = {'snp':snp,'chrom':chrom,'offset':offset} # first 3 cols for gg line | 
|  | 204                 lastsnp = snp # reset for next marker | 
|  | 205             #if p == 'NA': | 
|  | 206             #      p = 1.0 | 
|  | 207             # let's pass downstream for handling R is fine? | 
|  | 208             outl[test] = '%s\t%f' % (p,lp) | 
|  | 209     if len(outl.keys()) > 3: | 
|  | 210         l = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')] | 
|  | 211         res.append('\t'.join(l)) # last snp line | 
|  | 212     f = file(outfname,'w') | 
|  | 213     res.append('') | 
|  | 214     f.write('\n'.join(res)) | 
|  | 215     f.close() | 
|  | 216 | 
|  | 217 | 
|  | 218 | 
|  | 219 | 
|  | 220 if __name__ == "__main__": | 
|  | 221     """ | 
|  | 222     # called as | 
|  | 223     <command interpreter="python"> | 
|  | 224         rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name" | 
|  | 225         '$out_file1' '$logf' '$logf.files_path' '$gffout' | 
|  | 226     </command>    </command> | 
|  | 227     """ | 
|  | 228     if len(sys.argv) < 7: | 
|  | 229        s = 'rgCaCo.py needs 6 params - got %s \n' % (sys.argv) | 
|  | 230        print >> sys.stdout, s | 
|  | 231        sys.exit(0) | 
|  | 232     bfname = sys.argv[1] | 
|  | 233     name = sys.argv[2] | 
|  | 234     killme = string.punctuation + string.whitespace | 
|  | 235     trantab = string.maketrans(killme,'_'*len(killme)) | 
|  | 236     name = name.translate(trantab) | 
|  | 237     outfname = sys.argv[3] | 
|  | 238     logf = sys.argv[4] | 
|  | 239     logoutdir = sys.argv[5] | 
|  | 240     gffout = sys.argv[6] | 
|  | 241     topn = 1000 | 
|  | 242     try: | 
|  | 243         os.makedirs(logoutdir) | 
|  | 244     except: | 
|  | 245         pass | 
|  | 246     map_file = None | 
|  | 247     me = sys.argv[0] | 
|  | 248     amapf = '%s.bim' % bfname # to decode map in xformModel | 
|  | 249     flog = file(logf,'w') | 
|  | 250     logme = [] | 
|  | 251     cdir = os.getcwd() | 
|  | 252     s = 'Rgenetics %s http://rgenetics.org Galaxy Tools, rgCaCo.py started %s\n' % (myversion,timenow()) | 
|  | 253     print >> sys.stdout, s # so will appear as blurb for file | 
|  | 254     logme.append(s) | 
|  | 255     if verbose: | 
|  | 256         s = 'rgCaCo.py:  bfname=%s, logf=%s, argv = %s\n' % (bfname, logf, sys.argv) | 
|  | 257         print >> sys.stdout, s # so will appear as blurb for file | 
|  | 258         logme.append(s) | 
|  | 259     twd = tempfile.mkdtemp(suffix='rgCaCo') # make sure plink doesn't spew log file into the root! | 
|  | 260     tname = os.path.join(twd,name) | 
|  | 261     vcl = [plinke,'--noweb','--bfile',bfname,'--out',name,'--model'] | 
|  | 262     p=subprocess.Popen(' '.join(vcl),shell=True,stdout=flog,cwd=twd) | 
|  | 263     retval = p.wait() | 
|  | 264     resf = '%s.model' % tname # plink output is here we hope | 
|  | 265     xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file | 
|  | 266     makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgCaCo_TopTable',description=name,topn=topn) | 
|  | 267     flog.write('\n'.join(logme)) | 
|  | 268     flog.close() # close the log used | 
|  | 269     #shutil.copytree(twd,logoutdir) | 
|  | 270     shutil.rmtree(twd) # clean up | 
|  | 271 |