Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgTDT.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/local/bin/python | |
| 2 # hack to run and process a plink tdt | |
| 3 # expects args as | |
| 4 # bfilepath outname jobname outformat (wig,xls) | |
| 5 # ross lazarus | |
| 6 # for wig files, we need annotation so look for map file or complain | |
| 7 | |
| 8 """ | |
| 9 Parameters for wiggle track definition lines | |
| 10 All options are placed in a single line separated by spaces: | |
| 11 | |
| 12 track type=wiggle_0 name=track_label description=center_label \ | |
| 13 visibility=display_mode color=r,g,b altColor=r,g,b \ | |
| 14 priority=priority autoScale=on|off \ | |
| 15 gridDefault=on|off maxHeightPixels=max:default:min \ | |
| 16 graphType=bar|points viewLimits=lower:upper \ | |
| 17 yLineMark=real-value yLineOnOff=on|off \ | |
| 18 windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 | |
| 19 """ | |
| 20 | |
| 21 import sys,math,shutil,subprocess,os,time,tempfile,shutil,string | |
| 22 from os.path import abspath | |
| 23 from optparse import OptionParser | |
| 24 from rgutils import timenow, plinke | |
| 25 myversion = 'v0.003 January 2010' | |
| 26 verbose = False | |
| 27 | |
| 28 | |
| 29 | |
| 30 def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): | |
| 31 """ | |
| 32 score must be scaled to 0-1000 | |
| 33 | |
| 34 Want to make some wig tracks from each analysis | |
| 35 Best n -log10(p). Make top hit the window. | |
| 36 we use our tab output which has | |
| 37 rs chrom offset ADD_stat ADD_p ADD_log10p | |
| 38 rs3094315 1 792429 1.151 0.2528 0.597223 | |
| 39 | |
| 40 """ | |
| 41 | |
| 42 def is_number(s): | |
| 43 try: | |
| 44 float(s) | |
| 45 return True | |
| 46 except ValueError: | |
| 47 return False | |
| 48 header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) | |
| 49 column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] | |
| 50 halfwidth=100 | |
| 51 resfpath = os.path.join(twd,resf) | |
| 52 resf = open(resfpath,'r') | |
| 53 resfl = resf.readlines() # dumb but convenient for millions of rows | |
| 54 resfl = [x.split() for x in resfl] | |
| 55 headl = resfl[0] | |
| 56 resfl = resfl[1:] | |
| 57 headl = [x.strip().upper() for x in headl] | |
| 58 headIndex = dict(zip(headl,range(0,len(headl)))) | |
| 59 # s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' | |
| 60 chrpos = headIndex.get('CHROM',None) | |
| 61 rspos = headIndex.get('RS',None) | |
| 62 offspos = headIndex.get('OFFSET',None) | |
| 63 ppos = headIndex.get('-LOG10TDTP',None) | |
| 64 wewant = [chrpos,rspos,offspos,ppos] | |
| 65 if None in wewant: # missing something | |
| 66 logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) | |
| 67 return | |
| 68 resfl = [x for x in resfl if x[ppos] > ''] | |
| 69 resfl = [(float(x[ppos]),x) for x in resfl] # decorate | |
| 70 resfl.sort() | |
| 71 resfl.reverse() # using -log10 so larger is better | |
| 72 resfl = resfl[:topn] # truncate | |
| 73 pvals = [x[0] for x in resfl] # need to scale | |
| 74 resfl = [x[1] for x in resfl] # drop decoration | |
| 75 maxp = max(pvals) # need to scale | |
| 76 minp = min(pvals) | |
| 77 prange = abs(maxp-minp) + 0.5 # fudge | |
| 78 scalefact = 1000.0/prange | |
| 79 logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) | |
| 80 for i,row in enumerate(resfl): | |
| 81 row[ppos] = '%d' % (int(scalefact*pvals[i])) | |
| 82 resfl[i] = row # replace | |
| 83 outf = file(outfname,'w') | |
| 84 outf.write(header) | |
| 85 outres = [] # need to resort into chrom offset order | |
| 86 for i,lrow in enumerate(resfl): | |
| 87 chrom,snp,offset,p, = [lrow[x] for x in wewant] | |
| 88 gff = ('chr%s' % chrom,'rgTDT','variation','%d' % (int(offset)-halfwidth), | |
| 89 '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) | |
| 90 outres.append(gff) | |
| 91 outres = [(x[0],int(x[3]),x) for x in outres] # decorate | |
| 92 outres.sort() # into chrom offset | |
| 93 outres=[x[2] for x in outres] # undecorate | |
| 94 outres = ['\t'.join(x) for x in outres] | |
| 95 outf.write('\n'.join(outres)) | |
| 96 outf.write('\n') | |
| 97 outf.close() | |
| 98 | |
| 99 | |
| 100 | |
| 101 def xformTDT(infname='',resf='',outfname='',name='foo',mapf='/usr/local/galaxy/data/rg/lped/x.bim'): | |
| 102 """munge a plink .tdt file into either a ucsc track or an xls file | |
| 103 CHR SNP A1:A2 T:U_TDT OR_TDT CHISQ_TDT P_TDT | |
| 104 0 MitoT217C 2:3 0:0 NA NA NA | |
| 105 0 MitoG228A 1:4 0:0 NA NA NA | |
| 106 0 MitoT250C 2:3 0:0 NA NA NA | |
| 107 map file has | |
| 108 1 rs4378174 0 003980745 | |
| 109 1 rs10796404 0 005465256 | |
| 110 1 rs2697965 0 014023092 | |
| 111 | |
| 112 grrr! | |
| 113 Changed in 1.01 to | |
| 114 [rerla@hg fresh]$ head database/job_working_directory/445/rgTDT.tdt | |
| 115 CHR SNP BP A1 A2 T U OR CHISQ P | |
| 116 1 rs12562034 758311 1 3 71 79 0.8987 0.4267 0.5136 | |
| 117 1 rs3934834 995669 4 2 98 129 0.7597 4.233 0.03963 | |
| 118 | |
| 119 | |
| 120 """ | |
| 121 if verbose: | |
| 122 print 'Rgenetics Galaxy Tools, rgTDT.py.xformTDT got resf=%s, outtype=%s, outfname=%s' % (resf,outtype,outfname) | |
| 123 wewantcols = ['SNP','CHR','BP','A1','A2','T','U','OR','CHISQ','P'] | |
| 124 res = [] | |
| 125 s = 'rs\tchrom\toffset\ta1\ta2\ttransmitted\tuntransmitted\tTDTChiSq\tTDTp\t-log10TDTp\tAbsTDTOR\tTDTOR' # header | |
| 126 res.append(s) | |
| 127 rsdict = {} | |
| 128 if not mapf: | |
| 129 sys.stderr.write('rgTDT called but no map file provided - cannot determine locations') | |
| 130 sys.exit(1) | |
| 131 map = file(mapf,'r') | |
| 132 for l in map: # plink map | |
| 133 ll = l.strip().split() | |
| 134 if len(ll) >= 3: | |
| 135 rs=ll[1].strip() | |
| 136 chrom = ll[0] | |
| 137 if chrom.lower() == 'x': | |
| 138 chrom = '23' | |
| 139 if chrom.lower() == 'y': | |
| 140 chrom = '24' | |
| 141 if chrom.lower() == 'mito': | |
| 142 chrom = '25' | |
| 143 offset = ll[3] | |
| 144 rsdict[rs] = (chrom,offset) | |
| 145 f = open(resf,'r') | |
| 146 headl = f.next().strip() | |
| 147 headl = headl.split() | |
| 148 wewant = [headl.index(x) for x in wewantcols] | |
| 149 llen = len(headl) | |
| 150 lnum = anum = 0 | |
| 151 for l in f: | |
| 152 lnum += 1 | |
| 153 ll = l.strip().split() | |
| 154 if len(ll) >= llen: # valid line | |
| 155 snp,chrom,offset,a1,a2,t,u,orat,chisq,p = [ll[x] for x in wewant] | |
| 156 if chisq == 'NA' or p == 'NA' or orat == 'NA': | |
| 157 continue # can't use these lines - gg gets unhappy | |
| 158 snp = snp.strip() | |
| 159 lp = '0.0' | |
| 160 fp = '1.0' | |
| 161 fakeorat = '1.0' | |
| 162 if p.upper().strip() <> 'NA': | |
| 163 try: | |
| 164 fp = float(p) | |
| 165 if fp <> 0: | |
| 166 lp = '%6f' % -math.log10(fp) | |
| 167 fp = '%6f' % fp | |
| 168 except: | |
| 169 pass | |
| 170 else: | |
| 171 p = '1.0' | |
| 172 if orat.upper().strip() <> 'NA': | |
| 173 try: | |
| 174 fakeorat = orat | |
| 175 if float(orat) < 1.0: | |
| 176 fakeorat = '%6f' % (1.0/float(orat)) # invert so large values big | |
| 177 except: | |
| 178 pass | |
| 179 else: | |
| 180 orat = '1.0' | |
| 181 outl = '\t'.join([snp,chrom,offset,a1,a2,t,u,chisq,p,lp,fakeorat,orat]) | |
| 182 res.append(outl) | |
| 183 f = file(outfname,'w') | |
| 184 res.append('') | |
| 185 f.write('\n'.join(res)) | |
| 186 f.close() | |
| 187 | |
| 188 | |
| 189 if __name__ == "__main__": | |
| 190 """ called as | |
| 191 <command interpreter="python"> | |
| 192 rgTDT.py -i '$infile.extra_files_path/$infile.metadata.base_name' -o '$title' -f '$outformat' -r '$out_file1' -l '$logf' -x '${GALAXY_DATA_INDEX_DIR}/rg/bin/pl$ | |
| 193 | |
| 194 </command> | |
| 195 | |
| 196 """ | |
| 197 u = """ called in xml as | |
| 198 <command interpreter="python2.4"> | |
| 199 rgTDT.py -i $i -o $out_prefix -f $outformat -r $out_file1 -l $logf | |
| 200 </command> | |
| 201 """ | |
| 202 if len(sys.argv) < 6: | |
| 203 s = '## Error rgTDT.py needs 5 command line params - got %s \n' % (sys.argv) | |
| 204 if verbose: | |
| 205 print >> sys.stdout, s | |
| 206 sys.exit(0) | |
| 207 parser = OptionParser(usage=u, version="%prog 0.01") | |
| 208 a = parser.add_option | |
| 209 a("-i","--infile",dest="bfname") | |
| 210 a("-o","--oprefix",dest="oprefix") | |
| 211 a("-f","--formatOut",dest="outformat") | |
| 212 a("-r","--results",dest="outfname") | |
| 213 a("-l","--logfile",dest="logf") | |
| 214 a("-d","--du",dest="uId") | |
| 215 a("-e","--email",dest="uEmail") | |
| 216 a("-g","--gff",dest="gffout",default="") | |
| 217 (options,args) = parser.parse_args() | |
| 218 killme = string.punctuation + string.whitespace | |
| 219 trantab = string.maketrans(killme,'_'*len(killme)) | |
| 220 title = options.oprefix | |
| 221 title = title.translate(trantab) | |
| 222 map_file = '%s.bim' % (options.bfname) # | |
| 223 me = sys.argv[0] | |
| 224 alogf = options.logf # absolute paths | |
| 225 od = os.path.split(alogf)[0] | |
| 226 try: | |
| 227 os.path.makedirs(od) | |
| 228 except: | |
| 229 pass | |
| 230 aoutf = options.outfname # absolute paths | |
| 231 od = os.path.split(aoutf)[0] | |
| 232 try: | |
| 233 os.path.makedirs(od) | |
| 234 except: | |
| 235 pass | |
| 236 vcl = [plinke,'--noweb', '--bfile',options.bfname,'--out',title,'--mind','0.5','--tdt'] | |
| 237 logme = [] | |
| 238 if verbose: | |
| 239 s = 'Rgenetics %s http://rgenetics.org Galaxy Tools rgTDT.py started %s\n' % (myversion,timenow()) | |
| 240 print >> sys.stdout,s | |
| 241 logme.append(s) | |
| 242 s ='rgTDT.py: bfname=%s, logf=%s, argv = %s\n' % (options.bfname,alogf, sys.argv) | |
| 243 print >> sys.stdout,s | |
| 244 logme.append(s) | |
| 245 s = 'rgTDT.py: vcl=%s\n' % (' '.join(vcl)) | |
| 246 print >> sys.stdout,s | |
| 247 logme.append(s) | |
| 248 twd = tempfile.mkdtemp(suffix='rgTDT') # make sure plink doesn't spew log file into the root! | |
| 249 tname = os.path.join(twd,title) | |
| 250 p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd) | |
| 251 retval = p.wait() | |
| 252 shutil.copy('%s.log' % tname,alogf) | |
| 253 sto = file(alogf,'a') | |
| 254 sto.write('\n'.join(logme)) | |
| 255 resf = '%s.tdt' % tname # plink output is here we hope | |
| 256 xformTDT(options.bfname,resf,aoutf,title,map_file) # leaves the desired summary file | |
| 257 gffout = options.gffout | |
| 258 if gffout > '': | |
| 259 makeGFF(resf=aoutf,outfname=gffout,logf=sto,twd='.',name='rgTDT_Top_Table',description=title,topn=1000) | |
| 260 shutil.rmtree(twd) | |
| 261 sto.close() | |
| 262 | |
| 263 | |
| 264 |
