| 0 | 1 #!/usr/local/bin/python | 
|  | 2 """ | 
|  | 3 # added most of the available options for linear models | 
|  | 4 # june 2009 rml | 
|  | 5 # hack to run and process a plink quantitative trait | 
|  | 6 # | 
|  | 7 | 
|  | 8 This is a wrapper for Shaun Purcell's Plink linear/logistic models for | 
|  | 9 traits, covariates and genotypes. | 
|  | 10 | 
|  | 11 It requires some judgement to interpret the findings | 
|  | 12 We need some better visualizations - manhattan plots are good. | 
|  | 13 svg with rs numbers for top 1%? | 
|  | 14 | 
|  | 15 toptable tools - truncate a gg file down to some low percentile | 
|  | 16 | 
|  | 17 intersect with other tables - eg gene expression regressions on snps | 
|  | 18 | 
|  | 19 | 
|  | 20 | 
|  | 21 """ | 
|  | 22 | 
|  | 23 import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands | 
|  | 24 from rgutils import plinke | 
|  | 25 | 
|  | 26 def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): | 
|  | 27     """ | 
|  | 28     score must be scaled to 0-1000 | 
|  | 29 | 
|  | 30     Want to make some wig tracks from each analysis | 
|  | 31     Best n -log10(p). Make top hit the window. | 
|  | 32     we use our tab output which has | 
|  | 33     rs	chrom	offset	ADD_stat	ADD_p	ADD_log10p | 
|  | 34     rs3094315	1	792429	1.151	0.2528	0.597223 | 
|  | 35 | 
|  | 36     """ | 
|  | 37 | 
|  | 38     def is_number(s): | 
|  | 39         try: | 
|  | 40             float(s) | 
|  | 41             return True | 
|  | 42         except ValueError: | 
|  | 43             return False | 
|  | 44     header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) | 
|  | 45     column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] | 
|  | 46     halfwidth=100 | 
|  | 47     resfpath = os.path.join(twd,resf) | 
|  | 48     resf = open(resfpath,'r') | 
|  | 49     resfl = resf.readlines() # dumb but convenient for millions of rows | 
|  | 50     resfl = [x.split() for x in resfl] | 
|  | 51     headl = resfl[0] | 
|  | 52     resfl = resfl[1:] | 
|  | 53     headl = [x.strip().upper() for x in headl] | 
|  | 54     headIndex = dict(zip(headl,range(0,len(headl)))) | 
|  | 55     chrpos = headIndex.get('CHROM',None) | 
|  | 56     rspos = headIndex.get('RS',None) | 
|  | 57     offspos = headIndex.get('OFFSET',None) | 
|  | 58     ppos = headIndex.get('ADD_LOG10P',None) | 
|  | 59     wewant = [chrpos,rspos,offspos,ppos] | 
|  | 60     if None in wewant: # missing something | 
|  | 61        logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) | 
|  | 62        return | 
|  | 63     resfl = [x for x in resfl if x[ppos] > ''] | 
|  | 64     resfl = [(float(x[ppos]),x) for x in resfl] # decorate | 
|  | 65     resfl.sort() | 
|  | 66     resfl.reverse() # using -log10 so larger is better | 
|  | 67     resfl = resfl[:topn] # truncate | 
|  | 68     pvals = [x[0] for x in resfl] # need to scale | 
|  | 69     resfl = [x[1] for x in resfl] # drop decoration | 
|  | 70     if len(pvals) == 0: | 
|  | 71         logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) | 
|  | 72         sys.exit(1) | 
|  | 73     maxp = max(pvals) # need to scale | 
|  | 74     minp = min(pvals) | 
|  | 75     prange = abs(maxp-minp) + 0.5 # fudge | 
|  | 76     scalefact = 1000.0/prange | 
|  | 77     logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) | 
|  | 78     for i,row in enumerate(resfl): | 
|  | 79         row[ppos] = '%d' % (int(scalefact*pvals[i])) | 
|  | 80         resfl[i] = row # replace | 
|  | 81     outf = file(outfname,'w') | 
|  | 82     outf.write(header) | 
|  | 83     outres = [] # need to resort into chrom offset order | 
|  | 84     for i,lrow in enumerate(resfl): | 
|  | 85         chrom,snp,offset,p, = [lrow[x] for x in wewant] | 
|  | 86         gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), | 
|  | 87                '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) | 
|  | 88         outres.append(gff) | 
|  | 89     outres = [(x[0],int(x[3]),x) for x in outres] # decorate | 
|  | 90     outres.sort() # into chrom offset | 
|  | 91     outres=[x[2] for x in outres] # undecorate | 
|  | 92     outres = ['\t'.join(x) for x in outres] | 
|  | 93     outf.write('\n'.join(outres)) | 
|  | 94     outf.write('\n') | 
|  | 95     outf.close() | 
|  | 96 | 
|  | 97 | 
|  | 98 | 
|  | 99 def xformQassoc(resf='',outfname='',logf=None,twd='.'): | 
|  | 100     """	plink.assoc.linear to gg file | 
|  | 101 from the docs | 
|  | 102 The output per each SNP might look something like: | 
|  | 103 | 
|  | 104     CHR        SNP      BP  A1       TEST   NMISS       OR      STAT         P | 
|  | 105       5   rs000001   10001   A        ADD     664   0.7806    -1.942   0.05216 | 
|  | 106       5   rs000001   10001   A     DOMDEV     664   0.9395   -0.3562    0.7217 | 
|  | 107       5   rs000001   10001   A       COV1     664   0.9723   -0.7894    0.4299 | 
|  | 108       5   rs000001   10001   A       COV2     664    1.159    0.5132    0.6078 | 
|  | 109       5   rs000001   10001   A   GENO_2DF     664       NA     5.059    0.0797 | 
|  | 110     need to transform into gg columns for each distinct test | 
|  | 111     or bed for tracks? | 
|  | 112 | 
|  | 113     """ | 
|  | 114     logf.write('xformQassoc got resf=%s, outfname=%s\n' % (resf,outfname)) | 
|  | 115     resdict = {} | 
|  | 116     rsdict = {} | 
|  | 117     markerlist = [] | 
|  | 118     # plink is "clever" - will run logistic if only 2 categories such as gender | 
|  | 119     resfs = resf.split('.') | 
|  | 120     if resfs[-1] == 'logistic': | 
|  | 121         resfs[-1] = 'linear' | 
|  | 122     else: | 
|  | 123         resfs[-1] = 'logistic' | 
|  | 124     altresf = '.'.join(resfs) | 
|  | 125 | 
|  | 126     altresfpath = os.path.join(twd,altresf) | 
|  | 127     resfpath = os.path.join(twd,resf) | 
|  | 128     try: | 
|  | 129         resf = open(resfpath,'r') | 
|  | 130     except: | 
|  | 131         try: | 
|  | 132             resf = open(altresfpath,'r') | 
|  | 133         except: | 
|  | 134             print >> sys.stderr, '## error - no file plink output %s or %s found - cannot continue' % (resfpath, altresfpath) | 
|  | 135             sys.exit(1) | 
|  | 136     for lnum,row in enumerate(resf): | 
|  | 137         if lnum == 0: | 
|  | 138             headl = row.split() | 
|  | 139             headl = [x.strip().upper() for x in headl] | 
|  | 140             headIndex = dict(zip(headl,range(0,len(headl)))) | 
|  | 141             chrpos = headIndex.get('CHR',None) | 
|  | 142             rspos = headIndex.get('SNP',None) | 
|  | 143             offspos = headIndex.get('BP',None) | 
|  | 144             nmisspos = headIndex.get('NMISS',None) | 
|  | 145             testpos = headIndex.get('TEST',None) | 
|  | 146             ppos = headIndex.get('P',None) | 
|  | 147             coeffpos = headIndex.get('OR',None) | 
|  | 148             if not coeffpos: | 
|  | 149                 coeffpos = headIndex.get('BETA',None) | 
|  | 150             apos = headIndex.get('A1',None) | 
|  | 151             statpos = headIndex.get('STAT',None) | 
|  | 152             wewant = [chrpos,rspos,offspos,testpos,statpos,ppos,coeffpos,apos] | 
|  | 153             if None in wewant: # missing something | 
|  | 154                logf.write('missing a required header in xformQassoc - headIndex=%s\n' % headIndex) | 
|  | 155                return | 
|  | 156             llen = len(headl) | 
|  | 157         else: # no Nones! | 
|  | 158             ll = row.split() | 
|  | 159             if len(ll) >= llen: # valid line | 
|  | 160                 chrom,snp,offset,test,stat,p,coeff,allele = [ll[x] for x in wewant] | 
|  | 161                 snp = snp.strip() | 
|  | 162                 if p <> 'NA' : | 
|  | 163                   try: | 
|  | 164                     ffp = float(p) | 
|  | 165                     if ffp <> 0: | 
|  | 166                        lp =  -math.log10(ffp) | 
|  | 167                   except: | 
|  | 168                     lp = 0.0 | 
|  | 169                   resdict.setdefault(test,{}) | 
|  | 170                   resdict[test][snp] = (stat,p,'%f' % lp) | 
|  | 171                   if rsdict.get(snp,None) == None: | 
|  | 172                       rsdict[snp] = (chrom,offset) | 
|  | 173                       markerlist.append(snp) | 
|  | 174     # now have various tests indexed by rs | 
|  | 175     tk = resdict.keys() | 
|  | 176     tk.sort() # tests | 
|  | 177     ohead = ['rs','chrom','offset'] | 
|  | 178     for t in tk: # add headers | 
|  | 179         ohead.append('%s_stat' % t) | 
|  | 180         ohead.append('%s_p' % t) | 
|  | 181         ohead.append('%s_log10p' % t) | 
|  | 182     oheads = '\t'.join(ohead) | 
|  | 183     res = [oheads,] | 
|  | 184     for snp in markerlist: # retain original order | 
|  | 185         chrom,offset = rsdict[snp] | 
|  | 186         outl = [snp,chrom,offset] | 
|  | 187         for t in tk: | 
|  | 188             outl += resdict[t][snp] # add stat,p for this test | 
|  | 189         outs = '\t'.join(outl) | 
|  | 190         res.append(outs) | 
|  | 191     f = file(outfname,'w') | 
|  | 192     res.append('') | 
|  | 193     f.write('\n'.join(res)) | 
|  | 194     f.close() | 
|  | 195 | 
|  | 196 | 
|  | 197 if __name__ == "__main__": | 
|  | 198     """ | 
|  | 199 | 
|  | 200     <command interpreter="python"> | 
|  | 201         rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name' | 
|  | 202         "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' | 
|  | 203         '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$wigout' | 
|  | 204     </command> | 
|  | 205     """ | 
|  | 206     topn = 1000 | 
|  | 207     killme = string.punctuation+string.whitespace | 
|  | 208     trantab = string.maketrans(killme,'_'*len(killme)) | 
|  | 209     if len(sys.argv) < 17: | 
|  | 210        s = 'rgGLM.py needs 17 params - got %s \n' % (sys.argv) | 
|  | 211        sys.stderr.write(s) # print >>,s would probably also work? | 
|  | 212        sys.exit(0) | 
|  | 213     blurb = 'rgGLM.py called with %s' % sys.argv | 
|  | 214     print >> sys.stdout,blurb | 
|  | 215     bfname = sys.argv[1] | 
|  | 216     phename = sys.argv[2] | 
|  | 217     title = sys.argv[3] | 
|  | 218     title.translate(trantab) | 
|  | 219     predvar = sys.argv[4] | 
|  | 220     covar = sys.argv[5].strip() | 
|  | 221     outfname = sys.argv[6] | 
|  | 222     logfname = sys.argv[7] | 
|  | 223     op = os.path.split(logfname)[0] | 
|  | 224     try: # for test - needs this done | 
|  | 225         os.makedirs(op) | 
|  | 226     except: | 
|  | 227         pass | 
|  | 228     basename = sys.argv[8].translate(trantab) | 
|  | 229     inter = sys.argv[9] == '1' | 
|  | 230     cond = sys.argv[10].strip() | 
|  | 231     if cond == 'None': | 
|  | 232         cond = '' | 
|  | 233     gender = sys.argv[11] == '1' | 
|  | 234     mind = sys.argv[12] | 
|  | 235     geno = sys.argv[13] | 
|  | 236     maf = sys.argv[14] | 
|  | 237     logistic = sys.argv[15].strip()=='1' | 
|  | 238     gffout = sys.argv[16] | 
|  | 239     me = sys.argv[0] | 
|  | 240     phepath = '%s.pphe' % phename | 
|  | 241     twd = tempfile.mkdtemp(suffix='rgGLM') # make sure plink doesn't spew log file into the root! | 
|  | 242     tplog = os.path.join(twd,'%s.log' % basename) # should be path to plink log | 
|  | 243     vcl = [plinke,'--noweb','--bfile',bfname,'--pheno-name','"%s"' % predvar,'--pheno', | 
|  | 244            phepath,'--out',basename,'--mind %s' % mind, '--geno %s' % geno, | 
|  | 245            '--maf %s' % maf] | 
|  | 246     if logistic: | 
|  | 247         vcl.append('--logistic') | 
|  | 248         resf = '%s.assoc.logistic' % basename # plink output is here we hope | 
|  | 249     else: | 
|  | 250         vcl.append('--linear') | 
|  | 251         resf = '%s.assoc.linear' % basename # plink output is here we hope | 
|  | 252     resf = os.path.join(twd,resf) | 
|  | 253     if gender: | 
|  | 254         vcl.append('--sex') | 
|  | 255     if inter: | 
|  | 256         vcl.append('--interaction') | 
|  | 257     if covar > 'None': | 
|  | 258         vcl += ['--covar',phepath,'--covar-name',covar] # comma sep list of covariates | 
|  | 259     tcfile = None | 
|  | 260     if len(cond) > 0: # plink wants these in a file.. | 
|  | 261         dummy,tcfile = tempfile.mkstemp(suffix='condlist') # | 
|  | 262         f = open(tcfile,'w') | 
|  | 263         cl = cond.split() | 
|  | 264         f.write('\n'.join(cl)) | 
|  | 265         f.write('\n') | 
|  | 266         f.close() | 
|  | 267         vcl.append('--condition-list %s' % tcfile) | 
|  | 268     p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd) | 
|  | 269     retval = p.wait() | 
|  | 270     if tcfile: | 
|  | 271         os.unlink(tcfile) | 
|  | 272     plinklog = file(tplog,'r').read() | 
|  | 273     logf = file(logfname,'w') | 
|  | 274     logf.write(blurb) | 
|  | 275     logf.write('\n') | 
|  | 276     logf.write('vcl=%s\n' % vcl) | 
|  | 277     xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file | 
|  | 278     makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name='rgGLM_TopTable',description=title,topn=topn) | 
|  | 279     logf.write('\n') | 
|  | 280     logf.write(plinklog) | 
|  | 281     logf.close() | 
|  | 282     #shutil.rmtree(twd) # clean up | 
|  | 283 | 
|  | 284 | 
|  | 285 | 
|  | 286 | 
|  | 287 |