comparison multiple_to_gd_genotype.py @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents
children 184d14e4270d
comparison
equal deleted inserted replaced
26:91e835060ad2 27:8997f2ca8c7a
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # multiple_to_gd_genotype.py
5 #
6 # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the pathways of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 # MA 02110-1301, USA.
22
23 import argparse
24 import base64
25 import json
26 import os
27 import sys
28
29 def fold_line(line, maxlen=200, prefix="#"):
30 """
31 format hader to a 200 char max.
32 """
33 line_len = len(line)
34 prefix_len = len(prefix)
35
36 if line_len + prefix_len <= maxlen:
37 return '%s%s' % (prefix, line)
38
39 lines = []
40 start_idx = 0
41 offset = 0
42
43 while start_idx < line_len - 1:
44 last_idx = start_idx
45 idx = start_idx
46 start = start_idx
47
48 while idx != -1 and idx < maxlen + prefix_len + offset - 1:
49 last_idx = idx
50 idx = line.find(',', start)
51 start = idx+1
52
53 if idx == -1:
54 lines.append('%s%s' % (prefix, line[start_idx:]))
55 break
56
57 lines.append('%s%s' % (prefix, line[start_idx:last_idx+1]))
58 start_idx = last_idx + 1
59 offset = last_idx + 1
60
61 return '\n'.join(lines)
62
63
64 def formthdr(lPops,dbkey,species):
65 """
66 returns a formated metadata for a gd_genotype file from a paramSet
67 dictionary and a list (lPops) of individuals
68 """
69 clmnsVals=', '.join(['"%sG"'%(x+1) for x in range(len(lPops))])#"'1G', '2G', ..."
70 indvdls='], ['.join(['"%s", %s'%(lPops[x],x+5) for x in range(len(lPops))])#['DU23M01 Duroc domestic breed Europe', 5], ['DU23M02 Duroc domestic breed Europe', 6], ...
71 obj='{"rPos": 2, "column_names": ["chr", "pos", "A", "B", %s], "scaffold": 1, "pos": 2, "dbkey": "%s", "individuals": [[%s]], "ref": 1, "species": "%s"}'%(clmnsVals,dbkey,indvdls,species)
72 json_value = json.loads(obj)
73 hdr = fold_line(json.dumps(json_value, separators=(',',':'), sort_keys=True))
74 #~
75 return hdr
76
77
78 def formthdr_gdsnp(lPops,dbkey,species):
79 """
80 returns a formated metadata for a gd_genotype file from a paramSet
81 dictionary and a list (lPops) of individuals
82 """
83 clmnsVals=', '.join(['"%sA","%sB","%sG","%sQ"'%((x+1),(x+1),(x+1),(x+1)) for x in range(len(lPops))])#"'1G', '2G', ..."
84 indvdls='], ['.join(['"%s", %s'%(lPops[x],(((x+1)*4)+2)) for x in range(len(lPops))])#['DU23M01 Duroc domestic breed Europe', 5], ['DU23M02 Duroc domestic breed Europe', 9], ...
85 obj='{"rPos": 2, "column_names": ["chr", "pos", "A", "B", "Q", %s], "scaffold": 1, "pos": 2, "dbkey": "%s", "individuals": [[%s]], "ref": 1, "species": "%s"}'%(clmnsVals,dbkey,indvdls,species)
86 json_value = json.loads(obj)
87 hdr = fold_line(json.dumps(json_value, separators=(',',':'), sort_keys=True))
88 #~
89 return hdr
90
91
92 def selAnc(SNPs):
93 """
94 returns the ancestral and derived snps, and an gd_genotype-encoded
95 list of SNPs/nucleotides
96 """
97 dIUPAC={'AC':'M','AG':'R','AT':'W','CG':'S','CT':'Y','GT':'K'}#'N':'N','A':'A','T':'T','C':'C','G':'G',
98 dNtsCnts={}
99 for eSNP in SNPs:
100 if eSNP!='N':
101 try:
102 dNtsCnts[eSNP]+=1
103 except:
104 dNtsCnts[eSNP]=1
105 #~
106 nAlleles=len(dNtsCnts.keys())
107 assert nAlleles<=3
108 if nAlleles==3:
109 nonCons=[x for x in dNtsCnts.keys() if x in set(['A','T','C','G'])]
110 cons=[x for x in dNtsCnts.keys() if x not in nonCons]
111 assert len(nonCons)==2 and len(cons)==1 and dIUPAC[''.join(sorted(nonCons))]==''.join(cons)
112 ancd=nonCons[0]
113 dervd=nonCons[1]
114 if dNtsCnts[dervd]>dNtsCnts[ancd]:
115 ancd,dervd=dervd,ancd
116 elif nAlleles==2:
117 cons=set(dNtsCnts.keys()).intersection(set(dIUPAC.values()))
118 assert len(cons)<=1
119 if len(cons)==0:
120 ancd,dervd=dNtsCnts.keys()
121 if dNtsCnts[dervd]>dNtsCnts[ancd]:
122 ancd,dervd=dervd,ancd
123 else:
124 dervd=''.join(cons)
125 ancd=''.join([x for x in dNtsCnts.keys() if x!=dervd])
126 else:#<=1
127 ancd=''.join(dNtsCnts.keys())
128 dervd='N'
129 #~
130 lSNPsEncoded=[]
131 for eSNP in SNPs:
132 if eSNP=='N':
133 lSNPsEncoded.append('-1')
134 elif eSNP==ancd:
135 lSNPsEncoded.append('2')
136 elif eSNP==dervd:
137 lSNPsEncoded.append('0')
138 else:
139 lSNPsEncoded.append('1')
140 #~
141 return ancd,dervd,lSNPsEncoded
142
143
144
145 def from_csv(in_csv,outgd_gentp,dbkey,species):
146 """
147 returns a gd_genotype file format from csv file (saved in excel).
148 The input must consist of a set of rows with columns splited by a
149 comma. The first row must contain the names of the individuals. For
150 the other rows, the first of column must contain the chromosome and
151 position of the snp. The other columns must contain any kind of
152 fstat or genepop allele valid encoding, with at most 2 alleles. Also,
153 the user may input IUPAC valid nucleotide symbols. The program will
154 assume that the mosts common is the ancestral.
155 ------------- The file starts here ----------------
156 ,Ind_1,Ind_2,Ind_3,Ind_4,Ind_5,Ind_6,Ind_7
157 chr1 12334123,A,T,T,A,T,T,W
158 chr2 1232654,C,G,G,C,N,S,N
159 chr3 3356367,T,G,G,G,G,K,K
160 chr4 95673,A,C,C,A,C,M,M
161 chr5 45896,T,C,Y,Y,Y,C,T
162 ...
163 or
164 ...
165 chr6 2354,22,11,21,00,12,12,22
166 ------------- The file ends here -------------------
167 """
168 infoLn=True
169 slf=open(outgd_gentp,'w')
170 for echl in open(in_csv,'r'):
171 if echl.strip():
172 if infoLn:
173 lPops=[x for x in echl.splitlines()[0].split(',') if x.strip()]
174 hdr=formthdr(lPops,dbkey,species)
175 slf.write('%s\n'%hdr)
176 infoLn=False
177 else:
178 lsplitd=echl.splitlines()[0].split(',')
179 lchrpox,SNPs=lsplitd[0],[x for x in lsplitd[1:] if x.strip()]
180 lchrpox='\t'.join(lchrpox.strip().split())
181 if SNPs[0].isdigit():
182 lSNPsEncoded=[]
183 for snp in SNPs:
184 cnt=0
185 for ep in snp:
186 ep=int(ep)
187 assert ep<=2
188 cnt+=ep
189 cnt=4-cnt
190 if cnt==4:
191 frmtdSNP='-1'
192 else:
193 frmtdSNP=str(cnt)
194 lSNPsEncoded.append(frmtdSNP)
195 ancd,dervd='N','N'
196 else:
197 ancd,dervd,lSNPsEncoded=selAnc(SNPs)
198 outfrmtdDat='%s\t%s\t%s\t%s'%(lchrpox,ancd,dervd,'\t'.join(lSNPsEncoded))
199 slf.write('%s\n'%outfrmtdDat)
200 #~
201 slf.close()
202 return 0
203
204
205 def from_fstat(in_fstat,outgd_gentp,dbkey,species):
206 """
207 returns a gd_genotype file format from fstat file. Ignores pops
208 structures and alleles other than the combinations of the alleles
209 encoded by 0, 1, and 2 up to 9 digits. The first line contains 4
210 numbers separated by any number of spaces: number of samples, np,
211 number of loci, nl, highest number used to label an allele, nu
212 (?=2), and 1 if the code for alleles is a one digit number (1-2), a
213 2 if code for alleles is a 2 digit number (01-02) or a 3 if code for
214 alleles is a 3 digit number (001-002). Followed by nl lines: each
215 containing the name of a locus, in the order they will appear in the
216 rest of the file On line nl+2, a series of numbers as follow: 1 0102
217 0101 0101 0201 0 0101 first number: identifies the sample to which
218 the individual belongs second: the genotype of the individual at the
219 first locus, coded with a 2 digits number for each allele third: the
220 genotype at the second locus, until locus nl is entered. Missing
221 genotypes are encoded with 0 (0001 or 0100 are not a valid format,
222 so both alleles at a locus have to be known, otherwise, the genotype
223 is considered as missing) no empty lines are needed between samples
224 number of spaces between genotypes can be anything. Numbering of
225 samples need not be sequential the number of samples np needs to be
226 the same as the largest sample identifier. Samples need not to be
227 ordered nu needs to be equal to the largest code given to an allele
228 (even if there are less than nu alleles). Ancestral is taken as 01,
229 derived 02. In all cases ancestral and derived SNPs are returned as
230 N.
231 ------------- The file starts here ----------------
232 7 5 2 1
233 chr1 12334123
234 chr2 1232654
235 chr3 3356367
236 chr4 95673
237 chr5 45896
238 Ind_1 22 22 21 11 22
239 Ind_2 22 22 11 12 22
240 Ind_3 22 11 22 21 22
241 Ind_4 22 21 22 21 22
242 Ind_5 22 22 22 21 22
243 Ind_6 22 22 22 22 22
244 Ind_7 22 22 21 21 22
245 ------------- The file ends here -------------------
246 """
247 dChrPos,lPops,lChrPos,addPop={},[],[],False
248 clines=-1
249 for echl in open(in_fstat,'r'):
250 clines+=1
251 if echl.strip():
252 if clines==0:
253 nSmpls,nLoci,nUsed,nDigs=[x for x in echl.splitlines()[0].split() if x.strip()]
254 nLoci=int(nLoci)
255 elif clines<=nLoci:
256 addPop=True
257 lchrpox='\t'.join(echl.strip().split())
258 lChrPos.append(lchrpox)
259 elif addPop:
260 lsplitd=echl.splitlines()[0].split()
261 pop,SNPs=lsplitd[0],[x for x in lsplitd[1:] if x.strip()]
262 pop=pop.strip()
263 for x in range(nLoci):
264 snp=SNPs[x]
265 cnt=0
266 for ep in snp:
267 ep=int(ep)
268 assert ep<=2
269 cnt+=ep
270 cnt=4-cnt
271 if cnt==4:
272 frmtdSNP='-1'
273 else:
274 frmtdSNP=str(cnt)
275 try:
276 dChrPos[lChrPos[x]].append(frmtdSNP)
277 except:
278 dChrPos[lChrPos[x]]=[frmtdSNP]
279 #~
280 lPops.append(pop)
281 #~
282 hdr=formthdr(lPops,dbkey,species)
283 outfrmtdDat=['%s\t%s\t%s\t%s'%(x,'N','N','\t'.join(dChrPos[x])) for x in lChrPos]
284 #~
285 slf=open(outgd_gentp,'w')
286 slf.write('\n'.join([hdr,'\n'.join(outfrmtdDat)]))
287 slf.close()
288 return 0
289
290
291 def from_genepop(in_genepop,outgd_gentp,dbkey,species):
292 """
293 returns a gd_genotype file format from genepop file . Ignores pops
294 structures and alleles other than the combinations of the alleles
295 encoded by 00, 01, and 02. The second line must contain the chromosome
296 and position of the SNPs separated by an space or a tab. Each loci
297 should be separated by a comma. Alternatively, they may be given one
298 per line. They may be given one per line, or on the same line but
299 separated by commas. The name of individuals are defined as
300 everything on the left of a comma, and their genotypes following the
301 same order of the SNPs in the second line. Ancestral is taken as 01,
302 derived 02. In all cases ancestral and derived SNPs are returned as N
303 ------------- The file starts here ----------------
304 Microsat on Chiracus radioactivus, a pest species
305 chr1 23123, chr2 90394, chr3 90909, chr3 910909, chr4 10909
306 POP
307 AA2, 0201 0111 0102 0000 0101
308 AA1, 0201 0201 0202 0000 0101
309 A10, 0201 0201 0101 0000 0101
310 A11, 0201 0202 0102 0000 0102
311 A12, 0202 0201 0101 0000 0101
312 A11, 0101 0101 0101 0000 0101
313 A12, 0202 0201 0201 0000 0101
314 A11, 0201 0201 0101 0000 0101
315 Pop
316 AF1, 0000 0000 0000 0000 0101
317 AF2, 0201 0101 0102 0000 0101
318 AF3, 0202 0201 0202 0000 0101
319 AF4, 0201 0101 0000 0000 0101
320 AF5, 0201 0101 0202 0000 0101
321 AF6, 0101 0101 0102 0000 0101
322 AF7, 0201 0100 0000 0000 0101
323 AF8, 0101 0100 0000 0000 0201
324 AF9, 0201 0200 0000 0000 0101
325 AF10, 0101 0202 0202 0000 0101
326 pop
327 C211, 0101 0202 0202 0000 0101
328 C211, 0101 0101 0202 0000 0101
329 C21E, 0101 0102 0202 0000 0101
330 C21B, 0101 0101 0102 0000 0201
331 C21C, 0201 0101 0202 0000 0101
332 C21D, 0201 0101 0202 0000 0201
333 ------------- The file ends here -------------------
334 """
335 dChrPos,lPops,lChrPos,addPop,addDat={},[],[],False,True
336 clines=-1
337 for echl in open(in_genepop,'r'):
338 clines+=1
339 if echl.strip():
340 if echl.strip() in set(['pop','POP','Pop']):
341 addDat,addPop=False,True
342 elif addDat and clines>0:
343 lchrpox=['\t'.join(x.split()) for x in echl.split(',') if x.strip()]
344 lChrPos.extend(lchrpox)
345 elif addPop:
346 pop,SNPs=echl.splitlines()[0].split(',')
347 pop=pop.strip()
348 SNPs=[x for x in SNPs.split() if x.strip()]
349 for x in range(len(SNPs)):
350 snp=SNPs[x]
351 cnt=0
352 for ep in snp:
353 ep=int(ep)
354 assert ep<=2
355 cnt+=ep
356 cnt=4-cnt
357 if cnt==4:
358 frmtdSNP='-1'
359 else:
360 frmtdSNP=str(cnt)
361 try:
362 dChrPos[lChrPos[x]].append(frmtdSNP)
363 except:
364 dChrPos[lChrPos[x]]=[frmtdSNP]
365 #~
366 lPops.append(pop)
367 #~
368 hdr=formthdr(lPops,dbkey,species)
369 outfrmtdDat=['%s\t%s\t%s\t%s'%(x,'N','N','\t'.join(dChrPos[x])) for x in lChrPos]
370 #~
371 slf=open(outgd_gentp,'w')
372 slf.write('\n'.join([hdr,'\n'.join(outfrmtdDat)]))
373 slf.close()
374 return 0
375
376
377 def from_vcf(inVCF,outgd_gentp,dbkey,species):
378 """
379 returns a gd_genotype file format from vcf a file
380 """
381 slf=open(outgd_gentp,'w')
382 paramSet,addPop,adinfo=False,False,False
383 lPops=[]
384 for echl in open(inVCF,'r'):
385 if echl.strip():
386 if not paramSet:
387 if echl.find('##')==0:
388 pass
389 elif echl.find('#')==0:
390 paramSet={}
391 all_params=[x for x in echl.split() if x.strip()]
392 clmn=-1
393 for eparam in all_params:
394 clmn+=1
395 if eparam=='#CHROM':
396 paramSet['chr']=clmn
397 elif eparam=='POS':
398 paramSet['pos']=clmn
399 elif eparam=='REF':
400 paramSet['A']=clmn
401 elif eparam=='ALT':
402 paramSet['B']=clmn
403 elif eparam=='QUAL':
404 paramSet['qual']=clmn
405 elif eparam=='FORMAT':
406 paramSet['frmt']=clmn
407 addPop=True
408 elif addPop:
409 lPops.append(eparam)
410 paramSet[eparam]=clmn
411 if paramSet:
412 hdr=formthdr(lPops,dbkey,species)
413 slf.write('%s\n'%hdr)
414 else:
415 all_vals=[x for x in echl.split() if x.strip()]
416 frmt=[x for x in all_vals[paramSet['frmt']].split(':') if x.strip()]
417 clmn=-1
418 gtclmn,adclmn,qulclmn=False,False,False
419 for p in frmt:
420 clmn+=1
421 if p=='GT':
422 gtclmn=clmn
423 elif p=='AD':
424 adclmn=clmn
425 adinfo=True
426 elif p=='GQ':
427 qulclmn=clmn
428 #~
429 if adinfo:
430 outptInfo=[all_vals[paramSet['chr']],all_vals[paramSet['pos']],all_vals[paramSet['A']],all_vals[paramSet['B']],all_vals[paramSet['qual']]]
431 for ePop in lPops:
432 gntyp=all_vals[paramSet[ePop]].replace('|','/').split(':')[gtclmn].split('/')
433 encdGntyp,adA,adB,qual='-1','0','0','-1'
434 #~
435 if set(gntyp)!=set(['.']):
436 encdGntyp=2-sum([int(x) for x in gntyp])
437 if adclmn:
438 try:
439 adA,adB=all_vals[paramSet[ePop]].split(':')[adclmn].split(',')
440 except:
441 pass
442 if qulclmn:
443 try:
444 qual=all_vals[paramSet[ePop]].split(':')[qulclmn]
445 except:
446 pass
447 outptInfo.extend([adA,adB,str(encdGntyp),qual])
448 slf.write('%s\n'%'\t'.join(outptInfo))
449 #~
450 else:
451 outptInfo=[all_vals[paramSet['chr']],all_vals[paramSet['pos']],all_vals[paramSet['A']],all_vals[paramSet['B']]]
452 for ePop in lPops:
453 gntyp=all_vals[paramSet[ePop]].replace('|','/').split(':')[gtclmn].split('/')
454 try:
455 encdGntyp=2-sum([int(x) for x in gntyp])
456 except:
457 encdGntyp=-1
458 outptInfo.append(str(encdGntyp))
459 #~
460 slf.write('%s\n'%'\t'.join(outptInfo))
461 slf.close()
462 #~
463 if adinfo:
464 hdr=formthdr_gdsnp(lPops,dbkey,species)
465 slf=open('%stmp'%outgd_gentp,'w')
466 slf.write('%s\n'%hdr)
467 appnd=False
468 for echl in open(outgd_gentp,'r'):
469 if appnd:
470 slf.write(echl)
471 else:
472 if echl[0]!='#':
473 appnd=True
474 slf.close()
475 #~
476 os.system('mv %stmp %s'%(outgd_gentp,outgd_gentp))
477 #~
478 return 0
479
480
481 def main():
482 #~
483 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).')
484 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in VCF format.',required=True)
485 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in gd_genotype format.',required=True)
486 parser.add_argument('--dbkey',metavar='string',type=str,help='the input reference species dbkey (i.e. susScr3).',required=True)
487 parser.add_argument('--species',metavar='string',type=str,help='the input reference species name (i.e. int).',required=True)
488 parser.add_argument('--format',metavar='string',type=str,help='format of the input file (i.e. vcf, genepop, fstat, csv).',required=True)
489
490 args = parser.parse_args()
491
492 infile = args.input
493 outgd_gentp = args.output
494 dbkey = args.dbkey
495 species = base64.b64decode(args.species)
496 frmat = args.format
497
498 #~
499 if frmat=='vcf':
500 from_vcf(infile,outgd_gentp,dbkey,species)
501 elif frmat=='genepop':
502 from_genepop(infile,outgd_gentp,dbkey,species)
503 elif frmat=='fstat':
504 from_fstat(infile,outgd_gentp,dbkey,species)
505 elif frmat=='csv':
506 from_csv(infile,outgd_gentp,dbkey,species)
507
508 #~
509 return 0
510
511 if __name__ == '__main__':
512 main()
513