annotate tools/rgenetics/rgfakePed.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 # modified may 2011 to name components (map/ped) as RgeneticsData to align with default base_name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 # otherwise downstream tools fail
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 # modified march 2011 to remove post execution hook
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 # pedigree data faker
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 # specifically designed for scalability testing of
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 # Shaun Purcel's PLINK package
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 # derived from John Ziniti's original suggestion
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 # allele frequency spectrum and random mating added
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 # ross lazarus me fecit january 13 2007
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 # copyright ross lazarus 2007
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 # without psyco
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 # generates about 10k snp genotypes in 2k subjects (666 trios) per minute or so.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 # so 500k (a billion genotypes), at about 4 trios/min will a couple of hours to generate
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 # psyco makes it literally twice as quick!!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 # all rights reserved except as granted under the terms of the LGPL
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 # see http://www.gnu.org/licenses/lgpl.html
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 # for a copy of the license you receive with this software
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 # and for your rights and obligations
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 # especially if you wish to modify or redistribute this code
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 # january 19 added random missingness inducer
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 # currently about 15M genos/minute without psyco, 30M/minute with
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 # so a billion genos should take about 40 minutes with psyco or 80 without...
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 # added mendel error generator jan 23 rml
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 import random,sys,time,os,string
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 from optparse import OptionParser
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 defbasename="RgeneticsData"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 width = 500000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 ALLELES = ['1','2','3','4']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 prog = os.path.split(sys.argv[0])[-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 debug = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 """Natural-order sorting, supporting embedded numbers.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 # found at http://lists.canonical.org/pipermail/kragen-hacks/2005-October/000419.html
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 note test code there removed to conserve brain space
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 foo9bar2 < foo10bar2 < foo10bar10
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 import random, re, sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 def natsort_key(item):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 chunks = re.split('(\d+(?:\.\d+)?)', item)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 for ii in range(len(chunks)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 if chunks[ii] and chunks[ii][0] in '0123456789':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 if '.' in chunks[ii]: numtype = float
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 else: numtype = int
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 # wrap in tuple with '0' to explicitly specify numbers come first
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 chunks[ii] = (0, numtype(chunks[ii]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 chunks[ii] = (1, chunks[ii])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 return (chunks, item)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 def natsort(seq):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 "Sort a sequence of text strings in a reasonable order."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 alist = [item for item in seq]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 alist.sort(key=natsort_key)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 return alist
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 def makeUniformMAFdist(low=0.02, high=0.5):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 """Fake a non-uniform maf distribution to make the data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 more interesting. Provide uniform 0.02-0.5 distribution"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 MAFdistribution = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 for i in xrange(int(100*low),int(100*high)+1):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 freq = i/100.0 # uniform
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 MAFdistribution.append(freq)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 return MAFdistribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 def makeTriangularMAFdist(low=0.02, high=0.5, beta=5):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 """Fake a non-uniform maf distribution to make the data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 more interesting - more rare alleles """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 MAFdistribution = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 for i in xrange(int(100*low),int(100*high)+1):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 freq = (51 - i)/100.0 # large numbers of small allele freqs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 for j in range(beta*i): # or i*i for crude exponential distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 MAFdistribution.append(freq)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 return MAFdistribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 def makeFbathead(rslist=[], chromlist=[], poslist=[], width=100000):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 """header row
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 res = ['%s_%s_%s' % (chromlist[x], poslist[x], rslist[x]) for x in range(len(rslist))]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 return ' '.join(res)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 def makeMap( width=500000, MAFdistribution=[], useGP=False):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 """make snp allele and frequency tables for consistent generation"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 usegp = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 snpdb = 'snp126'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 hgdb = 'hg18'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 alleles = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 freqs = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 rslist = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 chromlist = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 poslist = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 for snp in range(width):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 random.shuffle(ALLELES)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 alleles.append(ALLELES[0:2]) # need two DIFFERENT alleles!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 freqs.append(random.choice(MAFdistribution)) # more rare alleles
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 if useGP:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 import MySQLdb
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 genome = MySQLdb.Connect('localhost', 'hg18', 'G3gn0m3')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 curs = genome.cursor() # use default cursor
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 if debug:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 print 'cannot connect to local copy of golden path'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 usegp = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 if usegp and useGP: # urrrgghh getting snps into chrom offset order is complicated....
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 curs.execute('use %s' % hgdb)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 print 'Collecting %d real rs numbers - this may take a while' % width
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 # get a random draw of enough reasonable (hapmap) snps with frequency data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 s = '''select distinct chrom,chromEnd, name from %s where avHet > 0 and chrom not like '%%random'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 group by name order by rand() limit %d''' % (snpdb,width)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 curs.execute(s)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 reslist = curs.fetchall()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 reslist = ['%s\t%09d\t%s' % (x[3:],y,z) for x,y,z in reslist] # get rid of chr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 reslist = natsort(reslist)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 for s in reslist:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 chrom,pos,rs = s.split('\t')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 rslist.append(rs)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 chromlist.append(chrom)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 poslist.append(pos)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 chrom = '1'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 for snp in range(width):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 pos = '%d' % (1000*snp)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 rs = 'rs00%d' % snp
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 rslist.append(rs)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 chromlist.append(chrom)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 poslist.append(pos)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 return alleles,freqs, rslist, chromlist, poslist
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 def writeMap(fprefix = '', fpath='./', rslist=[], chromlist=[], poslist=[], width = 500000):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 """make a faked plink compatible map file - fbat files
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 have the map written as a header line"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 outf = '%s.map'% (fprefix)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 outf = os.path.join(fpath,outf)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 amap = open(outf, 'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 res = ['%s\t%s\t0\t%s' % (chromlist[x],rslist[x],poslist[x]) for x in range(len(rslist))]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 res.append('')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 amap.write('\n'.join(res))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 amap.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 def makeMissing(genos=[], missrate = 0.03, missval = '0'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 """impose some random missingness"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 nsnps = len(genos)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 for snp in range(nsnps): # ignore first 6 columns
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 if random.random() <= missrate:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 genos[snp] = '%s %s' % (missval,missval)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 return genos
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 def makeTriomissing(genos=[], missrate = 0.03, missval = '0'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 """impose some random missingness on a trio - moth eaten like real data"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 for person in (0,1):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 nsnps = len(genos[person])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 for snp in range(nsnps):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 for person in [0,1,2]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161 if random.random() <= missrate:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 genos[person][snp] = '%s %s' % (missval,missval)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 return genos
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 def makeTriomendel(p1g=(0,0),p2g=(0,0), kiddip = (0,0)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167 """impose some random mendels on a trio
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 there are 8 of the 9 mating types we can simulate reasonable errors for
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 Note, since random mating dual het parents can produce any genotype we can't generate an interesting
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 error for them, so the overall mendel rate will be lower than mendrate, depending on
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171 allele frequency..."""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 if p1g[0] <> p1g[1] and p2g[0] <> p2g[1]: # both parents het
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173 return kiddip # cannot simulate a mendel error - anything is legal!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174 elif (p1g[0] <> p1g[1]): # p1 is het parent so p2 must be hom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 if p2g[0] == 0: # - make child p2 opposite hom for error
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176 kiddip = (1,1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
177 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
178 kiddip = (0,0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
179 elif (p2g[0] <> p2g[1]): # p2 is het parent so p1 must be hom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
180 if p1g[0] == 0: # - make child p1 opposite hom for error
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
181 kiddip = (1,1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
182 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
183 kiddip = (0,0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
184 elif (p1g[0] == p1g[1]): # p1 is hom parent and if we get here p2 must also be hom
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
185 if p1g[0] == p2g[0]: # both parents are same hom - make child either het or opposite hom for error
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
186 if random.random() <= 0.5:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
187 kiddip = (0,1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
188 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
189 if p1g[0] == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
190 kiddip = (1,1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
191 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
192 kiddip = (0,0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
193 else: # parents are opposite hom - return any hom as an error
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
194 if random.random() <= 0.5:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
195 kiddip = (0,0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
196 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
197 kiddip = (1,1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
198 return kiddip
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
199
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
200
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
201
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
202
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
203 def makeFam(width=100, freqs={}, alleles={}, trio=1, missrate=0.03, missval='0', mendrate=0.0):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
204 """this family is a simple trio, constructed by random mating two random genotypes
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
205 TODO: why not generate from chromosomes - eg hapmap
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
206 set each haplotype locus according to the conditional
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
207 probability implied by the surrounding loci - eg use both neighboring pairs, triplets
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
208 and quads as observed in hapmap ceu"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
209 dadped = '%d 1 0 0 1 1 %s'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
210 mumped = '%d 2 0 0 2 1 %s' # a mother is a mum where I come from :)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
211 kidped = '%d 3 1 2 %d %d %s'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
212 family = [] # result accumulator
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
213 sex = random.choice((1,2)) # for the kid
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
214 affected = random.choice((1,2))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
215 genos = [[],[],[]] # dad, mum, kid - 0/1 for common,rare initially, then xform to alleles
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
216 # parent1...kidn lists of 0/1 for common,rare initially, then xformed to alleles
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
217 for snp in xrange(width):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
218 f = freqs[snp]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
219 for i in range(2): # do dad and mum
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
220 p = random.random()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
221 a1 = a2 = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
222 if p <= f: # a rare allele
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
223 a1 = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
224 p = random.random()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
225 if p <= f: # a rare allele
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
226 a2 = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
227 if a1 > a2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
228 a1,a2 = a2,a1 # so ordering consistent - 00,01,11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
229 dip = (a1,a2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
230 genos[i].append(dip) # tuples of 0,1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
231 a1 = random.choice(genos[0][snp]) # dad gamete
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
232 a2 = random.choice(genos[1][snp]) # mum gamete
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
233 if a1 > a2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
234 a1,a2 = a2,a1 # so ordering consistent - 00,01,11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
235 kiddip = (a1,a2) # NSFW mating!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
236 genos[2].append(kiddip)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
237 if mendrate > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
238 if random.random() <= mendrate:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
239 genos[2][snp] = makeTriomendel(genos[0][snp],genos[1][snp], kiddip)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
240 achoice = alleles[snp]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
241 for g in genos: # now convert to alleles using allele dict
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
242 a1 = achoice[g[snp][0]] # get allele letter
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
243 a2 = achoice[g[snp][1]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
244 g[snp] = '%s %s' % (a1,a2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
245 if missrate > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
246 genos = makeTriomissing(genos=genos,missrate=missrate, missval=missval)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
247 family.append(dadped % (trio,' '.join(genos[0]))) # create a row for each member of trio
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
248 family.append(mumped % (trio,' '.join(genos[1])))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
249 family.append(kidped % (trio,sex,affected,' '.join(genos[2])))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
250 return family
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
251
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
252 def makePerson(width=100, aff=1, freqs={}, alleles={}, id=1, missrate = 0.03, missval='0'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
253 """make an entire genotype vector for an independent subject"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
254 sex = random.choice((1,2))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
255 if not aff:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
256 aff = random.choice((1,2))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
257 genos = [] #0/1 for common,rare initially, then xform to alleles
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
258 family = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
259 personped = '%d 1 0 0 %d %d %s'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
260 poly = (0,1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
261 for snp in xrange(width):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
262 achoice = alleles[snp]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
263 f = freqs[snp]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
264 p = random.random()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
265 a1 = a2 = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
266 if p <= f: # a rare allele
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
267 a1 = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
268 p = random.random()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
269 if p <= f: # a rare allele
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
270 a2 = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
271 if a1 > a2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
272 a1,a2 = a2,a1 # so ordering consistent - 00,01,11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
273 a1 = achoice[a1] # get allele letter
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
274 a2 = achoice[a2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
275 g = '%s %s' % (a1,a2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
276 genos.append(g)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
277 if missrate > 0.0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
278 genos = makeMissing(genos=genos,missrate=missrate, missval=missval)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
279 family.append(personped % (id,sex,aff,' '.join(genos)))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
280 return family
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
281
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
282 def makeHapmap(fprefix= 'fakebigped',width=100, aff=[], freqs={},
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
283 alleles={}, nsubj = 2000, trios = True, mendrate=0.03, missrate = 0.03, missval='0'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
284 """ fake a hapmap file and a pedigree file for eg haploview
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
285 this is arranged as the transpose of a ped file - cols are subjects, rows are markers
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
286 so we need to generate differently since we can't do the transpose in ram reliably for
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
287 a few billion genotypes...
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
288 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
289 outheadprefix = 'rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode %s'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
290 cfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
291 "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:lsid:dcc.hapmap.org:Panel:CEPH-30-trios:1","QC+"]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
292 yfake5 = ["illumina","urn:LSID:illumina.hapmap.org:Protocol:Golden_Gate_1.0.0:1",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
293 "urn:LSID:illumina.hapmap.org:Assay:27741:1","urn:LSID:dcc.hapmap.org:Panel:Yoruba-30-trios:1","QC+"]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
294 sampids = ids
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
295 if trios:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
296 ts = '%d trios' % int(nsubj/3.)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
297 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
298 ts = '%d unrelated subjects' % nsubj
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
299 res = ['#%s fake hapmap file %d snps and %s, faked by %s' % (timenow(), width, ts, prog),]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
300 res.append('# ross lazarus me fecit')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
301 res.append(outheadprefix % ' '.join(sampids)) # make a header compatible with hapmap extracts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
302 outf = open('%s.hmap' % (fprefix), 'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
303 started = time.time()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
304 if trios:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
305 ntrios = int(nsubj/3.)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
306 for n in ntrios: # each is a dict
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
307 row = copy.copy(cfake5) # get first fields
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
308 row = map(str,row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
309 if race == "YRI":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
310 row += yfake5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
311 elif race == 'CEU':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
312 row += cfake5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
313 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
314 row += ['NA' for x in range(5)] # 5 dummy fields = center protLSID assayLSID panelLSID QCcode
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
315 row += [''.join(sorted(line[x])) for x in sampids] # the genotypes in header (sorted) sample id order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
316 res.append(' '.join(row))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
317 res.append('')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
318 outfname = '%s_%s_%s_%dkb.geno' % (gene,probeid,race,2*flank/1000)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
319 f = file(outfname,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
320 f.write('\n'.join(res))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
321 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
322 print '### %s: Wrote %d lines to %s' % (timenow(), len(res),outfname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
323
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
324
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
325 def makePed(fprefix= 'fakebigped', fpath='./',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
326 width=500000, nsubj=2000, MAFdistribution=[],alleles={},
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
327 freqs={}, fbatstyle=True, mendrate = 0.0, missrate = 0.03, missval='0',fbathead=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
328 """fake trios with mendel consistent random mating genotypes in offspring
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
329 with consistent alleles and MAFs for the sample"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
330 res = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
331 if fbatstyle: # add a header row with the marker names
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
332 res.append(fbathead) # header row for fbat
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
333 outfname = '%s.ped'% (fprefix)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
334 outfname = os.path.join(fpath,outfname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
335 outf = open(outfname,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
336 ntrios = int(nsubj/3.)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
337 outf = open(outfile, 'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
338 started = time.time()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
339 for trio in xrange(ntrios):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
340 family = makeFam(width=width, freqs=freqs, alleles=alleles, trio=trio,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
341 missrate = missrate, mendrate=mendrate, missval=missval)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
342 res += family
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
343 if (trio + 1) % 10 == 0: # write out to keep ram requirements reasonable
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
344 if (trio + 1) % 50 == 0: # show progress
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
345 dur = time.time() - started
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
346 if dur == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
347 dur = 1.0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
348 print 'Trio: %d, %4.1f genos/sec at %6.1f sec' % (trio + 1, width*trio*3/dur, dur)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
349 outf.write('\n'.join(res))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
350 outf.write('\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
351 res = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
352 if len(res) > 0: # some left
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
353 outf.write('\n'.join(res))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
354 outf.write('\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
355 outf.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
356 if debug:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
357 print '##makeped : %6.1f seconds total runtime' % (time.time() - started)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
358
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
359 def makeIndep(fprefix = 'fakebigped', fpath='./',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
360 width=500000, Nunaff=1000, Naff=1000, MAFdistribution=[],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
361 alleles={}, freqs={}, fbatstyle=True, missrate = 0.03, missval='0',fbathead=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
362 """fake a random sample from a random mating sample
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
363 with consistent alleles and MAFs"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
364 res = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
365 Ntot = Nunaff + Naff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
366 status = [1,]*Nunaff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
367 status += [2,]*Nunaff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
368 outf = '%s.ped' % (fprefix)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
369 outf = os.path.join(fpath,outf)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
370 outf = open(outf, 'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
371 started = time.time()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
372 #sample = personMaker(width=width, affs=status, freqs=freqs, alleles=alleles, Ntomake=Ntot)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
373 if fbatstyle: # add a header row with the marker names
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
374 res.append(fbathead) # header row for fbat
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
375 for id in xrange(Ntot):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
376 if id < Nunaff:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
377 aff = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
378 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
379 aff = 2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
380 family = makePerson(width=width, aff=aff, freqs=freqs, alleles=alleles, id=id+1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
381 res += family
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
382 if (id % 50 == 0): # write out to keep ram requirements reasonable
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
383 if (id % 200 == 0): # show progress
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
384 dur = time.time() - started
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
385 if dur == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
386 dur = 1.0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
387 print 'Id: %d, %4.1f genos/sec at %6.1f sec' % (id, width*id/dur, dur)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
388 outf.write('\n'.join(res))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
389 outf.write('\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
390 res = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
391 if len(res) > 0: # some left
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
392 outf.write('\n'.join(res))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
393 outf.write('\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
394 outf.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
395 print '## makeindep: %6.1f seconds total runtime' % (time.time() - started)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
396
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
397 u = """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
398 Generate either trios or independent subjects with a prespecified
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
399 number of random alleles and a uniform or triangular MAF distribution for
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
400 stress testing. No LD is simulated - alleles are random. Offspring for
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
401 trios are generated by random mating the random parental alleles so there are
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
402 no Mendelian errors unless the -M option is used. Mendelian errors are generated
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
403 randomly according to the possible errors given the parental mating type although
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
404 this is fresh code and not guaranteed to work quite right yet - comments welcomed
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
405
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
406 Enquiries to ross.lazarus@gmail.com
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
407
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
408 eg to generate 700 trios with 500k snps, use:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
409 fakebigped.py -n 2100 -s 500000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
410 or to generate 500 independent cases and 500 controls with 100k snps and 0.02 missingness (MCAR), use:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
411 fakebigped.py -c 500 -n 1000 -s 100000 -m 0.02
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
412
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
413 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
414 will make fbat compatible myfake.ped with 100k markers in
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
415 666 trios (total close to 2000 subjects), a uniform MAF distribution and about 5% MCAR missing
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
416
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
417 fakebigped.py -o myfake -m 0.05 -s 100000 -n 2000 -M 0.05
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
418 will make fbat compatible myfake.ped with 100k markers in
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
419 666 trios (total close to 2000 subjects), a uniform MAF distribution,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
420 about 5% Mendelian errors and about 5% MCAR missing
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
421
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
422
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
423 fakebigped.py -o myfakecc -m 0.05 -s 100000 -n 2000 -c 1000 -l
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
424 will make plink compatible myfakecc.ped and myfakecc.map (that's what the -l option does),
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
425 with 100k markers in 1000 cases and 1000 controls (affection status 2 and 1 respectively),
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
426 a triangular MAF distribution (more rare alleles) and about 5% MCAR missing
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
427
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
428 You should see about 1/4 million genotypes/second so about an hour for a
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
429 500k snps in 2k subjects and about a 4GB ped file - these are BIG!!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
430
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
431 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
432
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
433 import sys, os, glob
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
434
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
435 galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
436 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
437 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
438 <head>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
439 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
440 <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
441 <title></title>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
442 <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
443 </head>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
444 <body>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
445 <div class="document">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
446 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
447
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
448
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
449 def doImport(outfile=None,outpath=None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
450 """ import into one of the new html composite data types for Rgenetics
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
451 Dan Blankenberg with mods by Ross Lazarus
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
452 October 2007
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
453 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
454 flist = glob.glob(os.path.join(outpath,'*'))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
455 outf = open(outfile,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
456 outf.write(galhtmlprefix % prog)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
457 for i, data in enumerate( flist ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
458 outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
459 outf.write('<br><h3>This is simulated null genotype data generated by Rgenetics!</h3>')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
460 outf.write('%s called with command line:<br><pre>' % prog)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
461 outf.write(' '.join(sys.argv))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
462 outf.write('\n</pre>\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
463 outf.write("</div></body></html>")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
464 outf.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
465
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
466
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
467
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
468 if __name__ == "__main__":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
469 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
470 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
471 parser = OptionParser(usage=u, version="%prog 0.01")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
472 a = parser.add_option
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
473 a("-n","--nsubjects",type="int",dest="Ntot",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
474 help="nsubj: total number of subjects",default=2000)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
475 a("-t","--title",dest="title",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
476 help="title: file basename for outputs",default='fakeped')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
477 a("-c","--cases",type="int",dest="Naff",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
478 help="number of cases: independent subjects with status set to 2 (ie cases). If not set, NTOT/3 trios will be generated", default = 0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
479 a("-s","--snps",dest="width",type="int",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
480 help="snps: total number of snps per subject", default=1000)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
481 a("-d","--distribution",dest="MAFdist",default="Uniform",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
482 help="MAF distribution - default is Uniform, can be Triangular")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
483 a("-o","--outf",dest="outf",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
484 help="Output file", default = 'fakeped')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
485 a("-p","--outpath",dest="outpath",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
486 help="Path for output files", default = './')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
487 a("-l","--pLink",dest="outstyle", default='L',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
488 help="Ped files as for Plink - no header, separate Map file - default is Plink style")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
489 a("-w","--loWmaf", type="float", dest="lowmaf", default=0.01, help="Lower limit for SNP MAF (minor allele freq)")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
490 a("-m","--missing",dest="missrate",type="float",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
491 help="missing: probability of missing MCAR - default 0.0", default=0.0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
492 a("-v","--valmiss",dest="missval",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
493 help="missing character: Missing allele code - usually 0 or N - default 0", default="0")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
494 a("-M","--Mendelrate",dest="mendrate",type="float",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
495 help="Mendelian error rate: probability of a mendel error per trio, default=0.0", default=0.0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
496 a("-H","--noHGRS",dest="useHG",type="int",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
497 help="Use local copy of UCSC snp126 database to generate real rs numbers", default=True)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
498 (options,args) = parser.parse_args()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
499 low = options.lowmaf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
500 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
501 os.makedirs(options.outpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
502 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
503 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
504 if options.MAFdist.upper() == 'U':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
505 mafDist = makeUniformMAFdist(low=low, high=0.5)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
506 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
507 mafDist = makeTriangularMAFdist(low=low, high=0.5, beta=5)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
508 alleles,freqs, rslist, chromlist, poslist = makeMap(width=int(options.width),
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
509 MAFdistribution=mafDist, useGP=False)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
510 fbathead = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
511 s = string.whitespace+string.punctuation
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
512 trantab = string.maketrans(s,'_'*len(s))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
513 title = string.translate(options.title,trantab)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
514
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
515 if options.outstyle == 'F':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
516 fbatstyle = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
517 fbathead = makeFbathead(rslist=rslist, chromlist=chromlist, poslist=poslist, width=options.width)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
518 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
519 fbatstyle = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
520 writeMap(fprefix=defbasename, rslist=rslist, fpath=options.outpath,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
521 chromlist=chromlist, poslist=poslist, width=options.width)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
522 if options.Naff > 0: # make case control data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
523 makeIndep(fprefix = defbasename, fpath=options.outpath,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
524 width=options.width, Nunaff=options.Ntot-options.Naff,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
525 Naff=options.Naff, MAFdistribution=mafDist,alleles=alleles, freqs=freqs,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
526 fbatstyle=fbatstyle, missrate=options.missrate, missval=options.missval,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
527 fbathead=fbathead)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
528 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
529 makePed(fprefix=defbasename, fpath=options.fpath,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
530 width=options.width, MAFdistribution=mafDist, nsubj=options.Ntot,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
531 alleles=alleles, freqs=freqs, fbatstyle=fbatstyle, missrate=options.missrate,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
532 mendrate=options.mendrate, missval=options.missval,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
533 fbathead=fbathead)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
534 doImport(outfile=options.outf,outpath=options.outpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
535
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
536
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
537