Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgfakePed.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 # modified may 2011 to name components (map/ped) as RgeneticsData to align with default base_name | |
| 2 # otherwise downstream tools fail | |
| 3 # modified march 2011 to remove post execution hook | |
| 4 # pedigree data faker | |
| 5 # specifically designed for scalability testing of | |
| 6 # Shaun Purcel's PLINK package | |
| 7 # derived from John Ziniti's original suggestion | |
| 8 # allele frequency spectrum and random mating added | |
| 9 # ross lazarus me fecit january 13 2007 | |
| 10 # copyright ross lazarus 2007 | |
| 11 # without psyco | |
| 12 # generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so. | |
| 13 # so 500k (a billion genotypes), at about 4 trios/min will a couple of hours to generate | |
| 14 # psyco makes it literally twice as quick!! | |
| 15 # all rights reserved except as granted under the terms of the LGPL | |
| 16 # see http://www.gnu.org/licenses/lgpl.html | |
| 17 # for a copy of the license you receive with this software | |
| 18 # and for your rights and obligations | |
| 19 # especially if you wish to modify or redistribute this code | |
| 20 # january 19 added random missingness inducer | |
| 21 # currently about 15M genos/minute without psyco, 30M/minute with | |
| 22 # so a billion genos should take about 40 minutes with psyco or 80 without... | |
| 23 # added mendel error generator jan 23 rml | |
| 24 | |
| 25 | |
| 26 import random,sys,time,os,string | |
| 27 | |
| 28 from optparse import OptionParser | |
| 29 | |
| 30 defbasename="RgeneticsData" | |
| 31 width = 500000 | |
| 32 ALLELES = ['1','2','3','4'] | |
| 33 prog = os.path.split(sys.argv[0])[-1] | |
| 34 debug = 0 | |
| 35 | |
| 36 """Natural-order sorting, supporting embedded numbers. | |
| 37 # found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html | |
| 38 note test code there removed to conserve brain space | |
| 39 foo9bar2 < foo10bar2 < foo10bar10 | |
| 40 | |
| 41 """ | |
| 42 import random, re, sys | |
| 43 | |
| 44 def natsort_key(item): | |
| 45 chunks = re.split('(\d+(?:\.\d+)?)', item) | |
| 46 for ii in range(len(chunks)): | |
| 47 if chunks[ii] and chunks[ii][0] in '0123456789': | |
| 48 if '.' in chunks[ii]: numtype = float | |
| 49 else: numtype = int | |
| 50 # wrap in tuple with '0' to explicitly specify numbers come first | |
| 51 chunks[ii] = (0, numtype(chunks[ii])) | |
| 52 else: | |
| 53 chunks[ii] = (1, chunks[ii]) | |
| 54 return (chunks, item) | |
| 55 | |
| 56 def natsort(seq): | |
| 57 "Sort a sequence of text strings in a reasonable order." | |
| 58 alist = [item for item in seq] | |
| 59 alist.sort(key=natsort_key) | |
| 60 return alist | |
| 61 | |
| 62 | |
| 63 def makeUniformMAFdist(low=0.02, high=0.5): | |
| 64 """Fake a non-uniform maf distribution to make the data | |
| 65 more interesting. Provide uniform 0.02-0.5 distribution""" | |
| 66 MAFdistribution = [] | |
| 67 for i in xrange(int(100*low),int(100*high)+1): | |
| 68 freq = i/100.0 # uniform | |
| 69 MAFdistribution.append(freq) | |
| 70 return MAFdistribution | |
| 71 | |
| 72 def makeTriangularMAFdist(low=0.02, high=0.5, beta=5): | |
| 73 """Fake a non-uniform maf distribution to make the data | |
| 74 more interesting - more rare alleles """ | |
| 75 MAFdistribution = [] | |
| 76 for i in xrange(int(100*low),int(100*high)+1): | |
| 77 freq = (51 - i)/100.0 # large numbers of small allele freqs | |
| 78 for j in range(beta*i): # or i*i for crude exponential distribution | |
| 79 MAFdistribution.append(freq) | |
| 80 return MAFdistribution | |
| 81 | |
| 82 def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000): | |
| 83 """header row | |
| 84 """ | |
| 85 res = ['%s_%s_%s' % (chromlist[x], poslist[x], rslist[x]) for x in range(len(rslist))] | |
| 86 return ' '.join(res) | |
| 87 | |
| 88 def makeMap( width=500000, MAFdistribution=[], useGP=False): | |
| 89 """make snp allele and frequency tables for consistent generation""" | |
| 90 usegp = 1 | |
| 91 snpdb = 'snp126' | |
| 92 hgdb = 'hg18' | |
| 93 alleles = [] | |
| 94 freqs = [] | |
| 95 rslist = [] | |
| 96 chromlist = [] | |
| 97 poslist = [] | |
| 98 for snp in range(width): | |
| 99 random.shuffle(ALLELES) | |
| 100 alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles! | |
| 101 freqs.append(random.choice(MAFdistribution)) # more rare alleles | |
| 102 if useGP: | |
| 103 try: | |
| 104 import MySQLdb | |
| 105 genome = MySQLdb.Connect('localhost', 'hg18', 'G3gn0m3') | |
| 106 curs = genome.cursor() # use default cursor | |
| 107 except: | |
| 108 if debug: | |
| 109 print 'cannot connect to local copy of golden path' | |
| 110 usegp = 0 | |
| 111 if usegp and useGP: # urrrgghh getting snps into chrom offset order is complicated.... | |
| 112 curs.execute('use %s' % hgdb) | |
| 113 print 'Collecting %d real rs numbers - this may take a while' % width | |
| 114 # get a random draw of enough reasonable (hapmap) snps with frequency data | |
| 115 s = '''select distinct chrom,chromEnd, name from %s where avHet > 0 and chrom not like '%%random' | |
| 116 group by name order by rand() limit %d''' % (snpdb,width) | |
| 117 curs.execute(s) | |
| 118 reslist = curs.fetchall() | |
| 119 reslist = ['%s\t%09d\t%s' % (x[3:],y,z) for x,y,z in reslist] # get rid of chr | |
| 120 reslist = natsort(reslist) | |
| 121 for s in reslist: | |
| 122 chrom,pos,rs = s.split('\t') | |
| 123 rslist.append(rs) | |
| 124 chromlist.append(chrom) | |
| 125 poslist.append(pos) | |
| 126 else: | |
| 127 chrom = '1' | |
| 128 for snp in range(width): | |
| 129 pos = '%d' % (1000*snp) | |
| 130 rs = 'rs00%d' % snp | |
| 131 rslist.append(rs) | |
| 132 chromlist.append(chrom) | |
| 133 poslist.append(pos) | |
| 134 return alleles,freqs, rslist, chromlist, poslist | |
| 135 | |
| 136 def writeMap(fprefix = '', fpath='./', rslist=[], chromlist=[], poslist=[], width = 500000): | |
| 137 """make a faked plink compatible map file - fbat files | |
| 138 have the map written as a header line""" | |
| 139 outf = '%s.map'% (fprefix) | |
| 140 outf = os.path.join(fpath,outf) | |
| 141 amap = open(outf, 'w') | |
| 142 res = ['%s\t%s\t0\t%s' % (chromlist[x],rslist[x],poslist[x]) for x in range(len(rslist))] | |
| 143 res.append('') | |
| 144 amap.write('\n'.join(res)) | |
| 145 amap.close() | |
| 146 | |
| 147 def makeMissing(genos=[], missrate = 0.03, missval = '0'): | |
| 148 """impose some random missingness""" | |
| 149 nsnps = len(genos) | |
| 150 for snp in range(nsnps): # ignore first 6 columns | |
| 151 if random.random() <= missrate: | |
| 152 genos[snp] = '%s %s' % (missval,missval) | |
| 153 return genos | |
| 154 | |
| 155 def makeTriomissing(genos=[], missrate = 0.03, missval = '0'): | |
| 156 """impose some random missingness on a trio - moth eaten like real data""" | |
| 157 for person in (0,1): | |
| 158 nsnps = len(genos[person]) | |
| 159 for snp in range(nsnps): | |
| 160 for person in [0,1,2]: | |
| 161 if random.random() <= missrate: | |
| 162 genos[person][snp] = '%s %s' % (missval,missval) | |
| 163 return genos | |
| 164 | |
| 165 | |
| 166 def makeTriomendel(p1g=(0,0),p2g=(0,0), kiddip = (0,0)): | |
| 167 """impose some random mendels on a trio | |
| 168 there are 8 of the 9 mating types we can simulate reasonable errors for | |
| 169 Note, since random mating dual het parents can produce any genotype we can't generate an interesting | |
| 170 error for them, so the overall mendel rate will be lower than mendrate, depending on | |
| 171 allele frequency...""" | |
| 172 if p1g[0] <> p1g[1] and p2g[0] <> p2g[1]: # both parents het | |
| 173 return kiddip # cannot simulate a mendel error - anything is legal! | |
| 174 elif (p1g[0] <> p1g[1]): # p1 is het parent so p2 must be hom | |
| 175 if p2g[0] == 0: # - make child p2 opposite hom for error | |
| 176 kiddip = (1,1) | |
| 177 else: | |
| 178 kiddip = (0,0) | |
| 179 elif (p2g[0] <> p2g[1]): # p2 is het parent so p1 must be hom | |
| 180 if p1g[0] == 0: # - make child p1 opposite hom for error | |
| 181 kiddip = (1,1) | |
| 182 else: | |
| 183 kiddip = (0,0) | |
| 184 elif (p1g[0] == p1g[1]): # p1 is hom parent and if we get here p2 must also be hom | |
| 185 if p1g[0] == p2g[0]: # both parents are same hom - make child either het or opposite hom for error | |
| 186 if random.random() <= 0.5: | |
| 187 kiddip = (0,1) | |
| 188 else: | |
| 189 if p1g[0] == 0: | |
| 190 kiddip = (1,1) | |
| 191 else: | |
| 192 kiddip = (0,0) | |
| 193 else: # parents are opposite hom - return any hom as an error | |
| 194 if random.random() <= 0.5: | |
| 195 kiddip = (0,0) | |
| 196 else: | |
| 197 kiddip = (1,1) | |
| 198 return kiddip | |
| 199 | |
| 200 | |
| 201 | |
| 202 | |
| 203 def makeFam(width=100, freqs={}, alleles={}, trio=1, missrate=0.03, missval='0', mendrate=0.0): | |
| 204 """this family is a simple trio, constructed by random mating two random genotypes | |
| 205 TODO: why not generate from chromosomes - eg hapmap | |
| 206 set each haplotype locus according to the conditional | |
| 207 probability implied by the surrounding loci - eg use both neighboring pairs, triplets | |
| 208 and quads as observed in hapmap ceu""" | |
| 209 dadped = '%d 1 0 0 1 1 %s' | |
| 210 mumped = '%d 2 0 0 2 1 %s' # a mother is a mum where I come from :) | |
| 211 kidped = '%d 3 1 2 %d %d %s' | |
| 212 family = [] # result accumulator | |
| 213 sex = random.choice((1,2)) # for the kid | |
| 214 affected = random.choice((1,2)) | |
| 215 genos = [[],[],[]] # dad, mum, kid - 0/1 for common,rare initially, then xform to alleles | |
| 216 # parent1...kidn lists of 0/1 for common,rare initially, then xformed to alleles | |
| 217 for snp in xrange(width): | |
| 218 f = freqs[snp] | |
| 219 for i in range(2): # do dad and mum | |
| 220 p = random.random() | |
| 221 a1 = a2 = 0 | |
| 222 if p <= f: # a rare allele | |
| 223 a1 = 1 | |
| 224 p = random.random() | |
| 225 if p <= f: # a rare allele | |
| 226 a2 = 1 | |
| 227 if a1 > a2: | |
| 228 a1,a2 = a2,a1 # so ordering consistent - 00,01,11 | |
| 229 dip = (a1,a2) | |
| 230 genos[i].append(dip) # tuples of 0,1 | |
| 231 a1 = random.choice(genos[0][snp]) # dad gamete | |
| 232 a2 = random.choice(genos[1][snp]) # mum gamete | |
| 233 if a1 > a2: | |
| 234 a1,a2 = a2,a1 # so ordering consistent - 00,01,11 | |
| 235 kiddip = (a1,a2) # NSFW mating! | |
| 236 genos[2].append(kiddip) | |
| 237 if mendrate > 0: | |
| 238 if random.random() <= mendrate: | |
| 239 genos[2][snp] = makeTriomendel(genos[0][snp],genos[1][snp], kiddip) | |
| 240 achoice = alleles[snp] | |
| 241 for g in genos: # now convert to alleles using allele dict | |
| 242 a1 = achoice[g[snp][0]] # get allele letter | |
| 243 a2 = achoice[g[snp][1]] | |
| 244 g[snp] = '%s %s' % (a1,a2) | |
| 245 if missrate > 0: | |
| 246 genos = makeTriomissing(genos=genos,missrate=missrate, missval=missval) | |
| 247 family.append(dadped % (trio,' '.join(genos[0]))) # create a row for each member of trio | |
| 248 family.append(mumped % (trio,' '.join(genos[1]))) | |
| 249 family.append(kidped % (trio,sex,affected,' '.join(genos[2]))) | |
| 250 return family | |
| 251 | |
| 252 def makePerson(width=100, aff=1, freqs={}, alleles={}, id=1, missrate = 0.03, missval='0'): | |
| 253 """make an entire genotype vector for an independent subject""" | |
| 254 sex = random.choice((1,2)) | |
| 255 if not aff: | |
| 256 aff = random.choice((1,2)) | |
| 257 genos = [] #0/1 for common,rare initially, then xform to alleles | |
| 258 family = [] | |
| 259 personped = '%d 1 0 0 %d %d %s' | |
| 260 poly = (0,1) | |
| 261 for snp in xrange(width): | |
| 262 achoice = alleles[snp] | |
| 263 f = freqs[snp] | |
| 264 p = random.random() | |
| 265 a1 = a2 = 0 | |
| 266 if p <= f: # a rare allele | |
| 267 a1 = 1 | |
| 268 p = random.random() | |
| 269 if p <= f: # a rare allele | |
| 270 a2 = 1 | |
| 271 if a1 > a2: | |
| 272 a1,a2 = a2,a1 # so ordering consistent - 00,01,11 | |
| 273 a1 = achoice[a1] # get allele letter | |
| 274 a2 = achoice[a2] | |
| 275 g = '%s %s' % (a1,a2) | |
| 276 genos.append(g) | |
| 277 if missrate > 0.0: | |
| 278 genos = makeMissing(genos=genos,missrate=missrate, missval=missval) | |
| 279 family.append(personped % (id,sex,aff,' '.join(genos))) | |
| 280 return family | |
| 281 | |
| 282 def makeHapmap(fprefix= 'fakebigped',width=100, aff=[], freqs={}, | |
| 283 alleles={}, nsubj = 2000, trios = True, mendrate=0.03, missrate = 0.03, missval='0'): | |
| 284 """ fake a hapmap file and a pedigree file for eg haploview | |
| 285 this is arranged as the transpose of a ped file - cols are subjects, rows are markers | |
| 286 so we need to generate differently since we can't do the transpose in ram reliably for | |
| 287 a few billion genotypes... | |
| 288 """ | |
| 289 outheadprefix = 'rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode %s' | |
| 290 cfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1", | |
| 291 "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1","QC+"] | |
| 292 yfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1", | |
| 293 "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:LSID:dcc.hapmap.org:Panel:Yoruba-30-trios:1","QC+"] | |
| 294 sampids = ids | |
| 295 if trios: | |
| 296 ts = '%d trios' % int(nsubj/3.) | |
| 297 else: | |
| 298 ts = '%d unrelated subjects' % nsubj | |
| 299 res = ['#%s fake hapmap file %d snps and %s, faked by %s' % (timenow(), width, ts, prog),] | |
| 300 res.append('# ross lazarus me fecit') | |
| 301 res.append(outheadprefix % ' '.join(sampids)) # make a header compatible with hapmap extracts | |
| 302 outf = open('%s.hmap' % (fprefix), 'w') | |
| 303 started = time.time() | |
| 304 if trios: | |
| 305 ntrios = int(nsubj/3.) | |
| 306 for n in ntrios: # each is a dict | |
| 307 row = copy.copy(cfake5) # get first fields | |
| 308 row = map(str,row) | |
| 309 if race == "YRI": | |
| 310 row += yfake5 | |
| 311 elif race == 'CEU': | |
| 312 row += cfake5 | |
| 313 else: | |
| 314 row += ['NA' for x in range(5)] # 5 dummy fields = center protLSID assayLSID panelLSID QCcode | |
| 315 row += [''.join(sorted(line[x])) for x in sampids] # the genotypes in header (sorted) sample id order | |
| 316 res.append(' '.join(row)) | |
| 317 res.append('') | |
| 318 outfname = '%s_%s_%s_%dkb.geno' % (gene,probeid,race,2*flank/1000) | |
| 319 f = file(outfname,'w') | |
| 320 f.write('\n'.join(res)) | |
| 321 f.close() | |
| 322 print '### %s: Wrote %d lines to %s' % (timenow(), len(res),outfname) | |
| 323 | |
| 324 | |
| 325 def makePed(fprefix= 'fakebigped', fpath='./', | |
| 326 width=500000, nsubj=2000, MAFdistribution=[],alleles={}, | |
| 327 freqs={}, fbatstyle=True, mendrate = 0.0, missrate = 0.03, missval='0',fbathead=''): | |
| 328 """fake trios with mendel consistent random mating genotypes in offspring | |
| 329 with consistent alleles and MAFs for the sample""" | |
| 330 res = [] | |
| 331 if fbatstyle: # add a header row with the marker names | |
| 332 res.append(fbathead) # header row for fbat | |
| 333 outfname = '%s.ped'% (fprefix) | |
| 334 outfname = os.path.join(fpath,outfname) | |
| 335 outf = open(outfname,'w') | |
| 336 ntrios = int(nsubj/3.) | |
| 337 outf = open(outfile, 'w') | |
| 338 started = time.time() | |
| 339 for trio in xrange(ntrios): | |
| 340 family = makeFam(width=width, freqs=freqs, alleles=alleles, trio=trio, | |
| 341 missrate = missrate, mendrate=mendrate, missval=missval) | |
| 342 res += family | |
| 343 if (trio + 1) % 10 == 0: # write out to keep ram requirements reasonable | |
| 344 if (trio + 1) % 50 == 0: # show progress | |
| 345 dur = time.time() - started | |
| 346 if dur == 0: | |
| 347 dur = 1.0 | |
| 348 print 'Trio: %d, %4.1f genos/sec at %6.1f sec' % (trio + 1, width*trio*3/dur, dur) | |
| 349 outf.write('\n'.join(res)) | |
| 350 outf.write('\n') | |
| 351 res = [] | |
| 352 if len(res) > 0: # some left | |
| 353 outf.write('\n'.join(res)) | |
| 354 outf.write('\n') | |
| 355 outf.close() | |
| 356 if debug: | |
| 357 print '##makeped : %6.1f seconds total runtime' % (time.time() - started) | |
| 358 | |
| 359 def makeIndep(fprefix = 'fakebigped', fpath='./', | |
| 360 width=500000, Nunaff=1000, Naff=1000, MAFdistribution=[], | |
| 361 alleles={}, freqs={}, fbatstyle=True, missrate = 0.03, missval='0',fbathead=''): | |
| 362 """fake a random sample from a random mating sample | |
| 363 with consistent alleles and MAFs""" | |
| 364 res = [] | |
| 365 Ntot = Nunaff + Naff | |
| 366 status = [1,]*Nunaff | |
| 367 status += [2,]*Nunaff | |
| 368 outf = '%s.ped' % (fprefix) | |
| 369 outf = os.path.join(fpath,outf) | |
| 370 outf = open(outf, 'w') | |
| 371 started = time.time() | |
| 372 #sample = personMaker(width=width, affs=status, freqs=freqs, alleles=alleles, Ntomake=Ntot) | |
| 373 if fbatstyle: # add a header row with the marker names | |
| 374 res.append(fbathead) # header row for fbat | |
| 375 for id in xrange(Ntot): | |
| 376 if id < Nunaff: | |
| 377 aff = 1 | |
| 378 else: | |
| 379 aff = 2 | |
| 380 family = makePerson(width=width, aff=aff, freqs=freqs, alleles=alleles, id=id+1) | |
| 381 res += family | |
| 382 if (id % 50 == 0): # write out to keep ram requirements reasonable | |
| 383 if (id % 200 == 0): # show progress | |
| 384 dur = time.time() - started | |
| 385 if dur == 0: | |
| 386 dur = 1.0 | |
| 387 print 'Id: %d, %4.1f genos/sec at %6.1f sec' % (id, width*id/dur, dur) | |
| 388 outf.write('\n'.join(res)) | |
| 389 outf.write('\n') | |
| 390 res = [] | |
| 391 if len(res) > 0: # some left | |
| 392 outf.write('\n'.join(res)) | |
| 393 outf.write('\n') | |
| 394 outf.close() | |
| 395 print '## makeindep: %6.1f seconds total runtime' % (time.time() - started) | |
| 396 | |
| 397 u = """ | |
| 398 Generate either trios or independent subjects with a prespecified | |
| 399 number of random alleles and a uniform or triangular MAF distribution for | |
| 400 stress testing. No LD is simulated - alleles are random. Offspring for | |
| 401 trios are generated by random mating the random parental alleles so there are | |
| 402 no Mendelian errors unless the -M option is used. Mendelian errors are generated | |
| 403 randomly according to the possible errors given the parental mating type although | |
| 404 this is fresh code and not guaranteed to work quite right yet - comments welcomed | |
| 405 | |
| 406 Enquiries to ross.lazarus@gmail.com | |
| 407 | |
| 408 eg to generate 700 trios with 500k snps, use: | |
| 409 fakebigped.py -n 2100 -s 500000 | |
| 410 or to generate 500 independent cases and 500 controls with 100k snps and 0.02 missingness (MCAR), use: | |
| 411 fakebigped.py -c 500 -n 1000 -s 100000 -m 0.02 | |
| 412 | |
| 413 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 | |
| 414 will make fbat compatible myfake.ped with 100k markers in | |
| 415 666 trios (total close to 2000 subjects), a uniform MAF distribution and about 5% MCAR missing | |
| 416 | |
| 417 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 -M 0.05 | |
| 418 will make fbat compatible myfake.ped with 100k markers in | |
| 419 666 trios (total close to 2000 subjects), a uniform MAF distribution, | |
| 420 about 5% Mendelian errors and about 5% MCAR missing | |
| 421 | |
| 422 | |
| 423 fakebigped.py -o myfakecc -m 0.05 -s 100000 -n 2000 -c 1000 -l | |
| 424 will make plink compatible myfakecc.ped and myfakecc.map (that's what the -l option does), | |
| 425 with 100k markers in 1000 cases and 1000 controls (affection status 2 and 1 respectively), | |
| 426 a triangular MAF distribution (more rare alleles) and about 5% MCAR missing | |
| 427 | |
| 428 You should see about 1/4 million genotypes/second so about an hour for a | |
| 429 500k snps in 2k subjects and about a 4GB ped file - these are BIG!! | |
| 430 | |
| 431 """ | |
| 432 | |
| 433 import sys, os, glob | |
| 434 | |
| 435 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> | |
| 436 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
| 437 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 438 <head> | |
| 439 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 440 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> | |
| 441 <title></title> | |
| 442 <link rel="stylesheet" href="/static/style/base.css" type="text/css" /> | |
| 443 </head> | |
| 444 <body> | |
| 445 <div class="document"> | |
| 446 """ | |
| 447 | |
| 448 | |
| 449 def doImport(outfile=None,outpath=None): | |
| 450 """ import into one of the new html composite data types for Rgenetics | |
| 451 Dan Blankenberg with mods by Ross Lazarus | |
| 452 October 2007 | |
| 453 """ | |
| 454 flist = glob.glob(os.path.join(outpath,'*')) | |
| 455 outf = open(outfile,'w') | |
| 456 outf.write(galhtmlprefix % prog) | |
| 457 for i, data in enumerate( flist ): | |
| 458 outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) | |
| 459 outf.write('<br><h3>This is simulated null genotype data generated by Rgenetics!</h3>') | |
| 460 outf.write('%s called with command line:<br><pre>' % prog) | |
| 461 outf.write(' '.join(sys.argv)) | |
| 462 outf.write('\n</pre>\n') | |
| 463 outf.write("</div></body></html>") | |
| 464 outf.close() | |
| 465 | |
| 466 | |
| 467 | |
| 468 if __name__ == "__main__": | |
| 469 """ | |
| 470 """ | |
| 471 parser = OptionParser(usage=u, version="%prog 0.01") | |
| 472 a = parser.add_option | |
| 473 a("-n","--nsubjects",type="int",dest="Ntot", | |
| 474 help="nsubj: total number of subjects",default=2000) | |
| 475 a("-t","--title",dest="title", | |
| 476 help="title: file basename for outputs",default='fakeped') | |
| 477 a("-c","--cases",type="int",dest="Naff", | |
| 478 help="number of cases: independent subjects with status set to 2 (ie cases). If not set, NTOT/3 trios will be generated", default = 0) | |
| 479 a("-s","--snps",dest="width",type="int", | |
| 480 help="snps: total number of snps per subject", default=1000) | |
| 481 a("-d","--distribution",dest="MAFdist",default="Uniform", | |
| 482 help="MAF distribution - default is Uniform, can be Triangular") | |
| 483 a("-o","--outf",dest="outf", | |
| 484 help="Output file", default = 'fakeped') | |
| 485 a("-p","--outpath",dest="outpath", | |
| 486 help="Path for output files", default = './') | |
| 487 a("-l","--pLink",dest="outstyle", default='L', | |
| 488 help="Ped files as for Plink - no header, separate Map file - default is Plink style") | |
| 489 a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)") | |
| 490 a("-m","--missing",dest="missrate",type="float", | |
| 491 help="missing: probability of missing MCAR - default 0.0", default=0.0) | |
| 492 a("-v","--valmiss",dest="missval", | |
| 493 help="missing character: Missing allele code - usually 0 or N - default 0", default="0") | |
| 494 a("-M","--Mendelrate",dest="mendrate",type="float", | |
| 495 help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0) | |
| 496 a("-H","--noHGRS",dest="useHG",type="int", | |
| 497 help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True) | |
| 498 (options,args) = parser.parse_args() | |
| 499 low = options.lowmaf | |
| 500 try: | |
| 501 os.makedirs(options.outpath) | |
| 502 except: | |
| 503 pass | |
| 504 if options.MAFdist.upper() == 'U': | |
| 505 mafDist = makeUniformMAFdist(low=low, high=0.5) | |
| 506 else: | |
| 507 mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5) | |
| 508 alleles,freqs, rslist, chromlist, poslist = makeMap(width=int(options.width), | |
| 509 MAFdistribution=mafDist, useGP=False) | |
| 510 fbathead = [] | |
| 511 s = string.whitespace+string.punctuation | |
| 512 trantab = string.maketrans(s,'_'*len(s)) | |
| 513 title = string.translate(options.title,trantab) | |
| 514 | |
| 515 if options.outstyle == 'F': | |
| 516 fbatstyle = True | |
| 517 fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width) | |
| 518 else: | |
| 519 fbatstyle = False | |
| 520 writeMap(fprefix=defbasename, rslist=rslist, fpath=options.outpath, | |
| 521 chromlist=chromlist, poslist=poslist, width=options.width) | |
| 522 if options.Naff > 0: # make case control data | |
| 523 makeIndep(fprefix = defbasename, fpath=options.outpath, | |
| 524 width=options.width, Nunaff=options.Ntot-options.Naff, | |
| 525 Naff=options.Naff, MAFdistribution=mafDist,alleles=alleles, freqs=freqs, | |
| 526 fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval, | |
| 527 fbathead=fbathead) | |
| 528 else: | |
| 529 makePed(fprefix=defbasename, fpath=options.fpath, | |
| 530 width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot, | |
| 531 alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate, | |
| 532 mendrate=options.mendrate, missval=options.missval, | |
| 533 fbathead=fbathead) | |
| 534 doImport(outfile=options.outf,outpath=options.outpath) | |
| 535 | |
| 536 | |
| 537 |
