comparison make_phylip.py @ 35:ea52b23f1141

Bug fixes for Draw variants, Phylip, and gd_d_tools
author Richard Burhans <burhans@bx.psu.edu>
date Wed, 20 Nov 2013 13:46:10 -0500
parents a631c2f6d913
children 51cd0307fb70
comparison
equal deleted inserted replaced
34:f739a296a339 35:ea52b23f1141
22 22
23 import argparse 23 import argparse
24 import errno 24 import errno
25 import os 25 import os
26 import shutil 26 import shutil
27 from Population import Population
27 28
28 def mkdir_p(path): 29 def mkdir_p(path):
29 try: 30 try:
30 os.makedirs(path) 31 os.makedirs(path)
31 except OSError, e: 32 except OSError, e:
380 outFastaf.write('\n'.join(seqPMLformat)) 381 outFastaf.write('\n'.join(seqPMLformat))
381 outFastaf.close() 382 outFastaf.close()
382 #~ 383 #~
383 return 0 384 return 0
384 385
386 def pos_dict(gd_indivs_file, input_type):
387 rv = {}
388
389 p = Population()
390 p.from_population_file(gd_indivs_file)
391
392 for tag in p.tag_list():
393 column, name = tag.split(':')
394 column = int(column) - 1
395
396 if input_type == 'gd_genotype':
397 column -= 2
398
399 rv[name] = column
400
401 return rv
402
385 def main(): 403 def main():
386 #~ 404 #~
387 #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0 405 #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0
388 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') 406 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).')
389 parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True) 407 parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True)
408 parser.add_argument('--input_type',metavar='input type',type=str,help='the input file type (gd_snp or gd_genotype)',required=True)
390 parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True) 409 parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True)
391 parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True) 410 parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True)
392 parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True) 411 parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True)
393 parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True) 412 parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True)
394 parser.add_argument('--output',metavar='output',type=str,help='the output',required=True) 413 parser.add_argument('--output',metavar='output',type=str,help='the output',required=True)
395 parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True) 414 parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True)
396 parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True) 415 parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True)
397 parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True) 416 parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True)
398 #~ 417 #~
399 parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False) 418 parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False)
419 parser.add_argument('--inputCover_type',metavar='input cover type',type=str,help='the cover input file type (gd_snp or gd_genotype)',required=False,default=False)
400 parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False) 420 parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False)
401 parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False) 421 parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False)
402 parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False) 422 parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False)
403 parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False) 423 parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False)
404 parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False) 424 parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False)
414 parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False) 434 parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False)
415 parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False) 435 parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False)
416 parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False) 436 parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False)
417 parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False) 437 parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False)
418 parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False) 438 parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False)
419 439
420 args = parser.parse_args() 440 args = parser.parse_args()
421 441
422 inSNPf = args.input 442 inSNPf = args.input
443 inSNPf_type = args.input_type
423 outfile = args.output 444 outfile = args.output
424 outfile_id = args.output_id 445 outfile_id = args.output_id
425 outFastaFold = './out' 446 outFastaFold = './out'
426 files_dir = args.output_dir 447 files_dir = args.output_dir
427 gd_indivs = args.gd_indivs 448 gd_indivs = args.gd_indivs
442 cdsEndClmn = args.cdsEndClmn#coding sequence end column 463 cdsEndClmn = args.cdsEndClmn#coding sequence end column
443 startExsClmn = args.startExsClmn#exons start column 464 startExsClmn = args.startExsClmn#exons start column
444 endExsClmn = args.endExsClmn#exons end column 465 endExsClmn = args.endExsClmn#exons end column
445 466
446 inputCover = args.inputCover 467 inputCover = args.inputCover
468 inputCover_type = args.inputCover_type
447 gd_indivs_cover = args.gd_indivs_cover 469 gd_indivs_cover = args.gd_indivs_cover
448 cvrgTreshold = args.cvrgTreshold 470 cvrgTreshold = args.cvrgTreshold
449 pxchrxCov = args.chrClmnCvrg 471 pxchrxCov = args.chrClmnCvrg
450 pxposCov = args.posClmnCvrg 472 pxposCov = args.posClmnCvrg
451 pxntACov = args.refClmnCvrg 473 pxntACov = args.refClmnCvrg
452 pxntBCov = args.altrClmnCvrg 474 pxntBCov = args.altrClmnCvrg
453 indvlsPrctTrshld = args.indvlsPrctTrshld 475 indvlsPrctTrshld = args.indvlsPrctTrshld
454 476
477
455 #print inputCover, gd_indivs_cover, cvrgTreshold 478 #print inputCover, gd_indivs_cover, cvrgTreshold
456 479
457 assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile)) 480 assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile))
458 481
459 #~ 482 dPopsinSNPfPos = pos_dict(gd_indivs, inSNPf_type)
460 dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()])
461 #~ dPopsinSNPfPos.update({'ref':False}) 483 #~ dPopsinSNPfPos.update({'ref':False})
462 #~ 484 #~
463 sPopsIntrst=set(dPopsinSNPfPos.keys()) 485 sPopsIntrst=set(dPopsinSNPfPos.keys())
464 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...' 486 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...'
465 #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile) 487 #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile)
466 #~ 488 #~
467 if inputCover and gd_indivs_cover and cvrgTreshold>=0: 489 if inputCover and gd_indivs_cover and cvrgTreshold>=0:
468 dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()]) 490 dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()])
469 dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()])) 491 dPopsinSNPfPos_cover.update(pos_dict(gd_indivs_cover, inputCover_type))
470 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld) 492 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld)
471 rvrse=False 493 rvrse=False
472 dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)} 494 dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)}
473 dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}} 495 dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}}
474 #~ 496 #~
480 dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...' 502 dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...'
481 #~ 503 #~
482 wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst) 504 wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst)
483 #~ 505 #~
484 506
485 507 ## get a list of output files
486 ## get a lit of output files
487 files = [] 508 files = []
488 for dirpath, dirnames, filenames in os.walk(outFastaFold): 509 for dirpath, dirnames, filenames in os.walk(outFastaFold):
489 for file in filenames: 510 for file in filenames:
490 if file.endswith('.phy'): 511 if file.endswith('.phy'):
491 files.append( os.path.join(dirpath, file) ) 512 files.append( os.path.join(dirpath, file) )
498 ## the first file becomes the output 519 ## the first file becomes the output
499 file = files.pop(0) 520 file = files.pop(0)
500 shutil.move(file, outfile) 521 shutil.move(file, outfile)
501 522
502 ## rename/move the rest of the files 523 ## rename/move the rest of the files
503 for i, file in enumerate(files): 524 for file in files:
504 new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2) 525 name = os.path.basename(file)[:-4]
526 name = name.replace('_', '-')
527 new_filename = 'primary_{0}_{1}_visible_txt_?'.format(outfile_id, name)
505 new_pathname = os.path.join(files_dir, new_filename) 528 new_pathname = os.path.join(files_dir, new_filename)
506 shutil.move(file, new_pathname) 529 shutil.move(file, new_pathname)
507 530
508 return 0 531 return 0
509 532