genome_diversity: make_phylip.py comparison

comparison make_phylip.py @ 35:ea52b23f1141

Bug fixes for Draw variants, Phylip, and gd_d_tools

author	Richard Burhans <burhans@bx.psu.edu>
date	Wed, 20 Nov 2013 13:46:10 -0500
parents	a631c2f6d913
children	51cd0307fb70

comparison

equal deleted inserted replaced

-:f739a296a339
+:ea52b23f1141
 import argparse
 import errno
 import os
 import shutil
+from Population import Population
 def mkdir_p(path):
 try:
 os.makedirs(path)
 except OSError, e:
 outFastaf.write('\n'.join(seqPMLformat))
 outFastaf.close()
 #~
 return 0
+def pos_dict(gd_indivs_file, input_type):
+rv = {}
+p = Population()
+p.from_population_file(gd_indivs_file)
+for tag in p.tag_list():
+column, name = tag.split(':')
+column = int(column) - 1
+if input_type == 'gd_genotype':
+column -= 2
+rv[name] = column
+return rv
 def main():
 #~
 #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0
 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).')
 parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True)
+parser.add_argument('--input_type',metavar='input type',type=str,help='the input file type (gd_snp or gd_genotype)',required=True)
 parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True)
 parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True)
 parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True)
 parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True)
 parser.add_argument('--output',metavar='output',type=str,help='the output',required=True)
 parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True)
 parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True)
 parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True)
 #~
 parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False)
+parser.add_argument('--inputCover_type',metavar='input cover type',type=str,help='the cover input file type (gd_snp or gd_genotype)',required=False,default=False)
 parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False)
 parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False)
 parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False)
 parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False)
 parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False)
 parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False)
 parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False)
 parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False)
 parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False)
 parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False)
 args = parser.parse_args()
 inSNPf = args.input
+inSNPf_type = args.input_type
 outfile = args.output
 outfile_id = args.output_id
 outFastaFold = './out'
 files_dir = args.output_dir
 gd_indivs = args.gd_indivs
 cdsEndClmn = args.cdsEndClmn#coding sequence end column
 startExsClmn = args.startExsClmn#exons start column
 endExsClmn = args.endExsClmn#exons end column
 inputCover = args.inputCover
+inputCover_type = args.inputCover_type
 gd_indivs_cover = args.gd_indivs_cover
 cvrgTreshold = args.cvrgTreshold
 pxchrxCov = args.chrClmnCvrg
 pxposCov = args.posClmnCvrg
 pxntACov = args.refClmnCvrg
 pxntBCov = args.altrClmnCvrg
 indvlsPrctTrshld = args.indvlsPrctTrshld
 #print inputCover, gd_indivs_cover, cvrgTreshold
 assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile))
-#~
+dPopsinSNPfPos = pos_dict(gd_indivs, inSNPf_type)
-dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()])
 #~ dPopsinSNPfPos.update({'ref':False})
 #~
 sPopsIntrst=set(dPopsinSNPfPos.keys())
 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...'
 #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile)
 #~
 if  inputCover and gd_indivs_cover and cvrgTreshold>=0:
 dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()])
-dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()]))
+dPopsinSNPfPos_cover.update(pos_dict(gd_indivs_cover, inputCover_type))
 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld)
 rvrse=False
 dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)}
 dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}}
 #~
 dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...'
 #~
 wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst)
 #~
+## get a list of output files
-## get a lit of output files
 files = []
 for dirpath, dirnames, filenames in os.walk(outFastaFold):
 for file in filenames:
 if file.endswith('.phy'):
 files.append( os.path.join(dirpath, file) )
 ## the first file becomes the output
 file = files.pop(0)
 shutil.move(file, outfile)
 ## rename/move the rest of the files
-for i, file in enumerate(files):
+for file in files:
-new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2)
+name = os.path.basename(file)[:-4]
+name = name.replace('_', '-')
+new_filename = 'primary_{0}_{1}_visible_txt_?'.format(outfile_id, name)
 new_pathname = os.path.join(files_dir, new_filename)
 shutil.move(file, new_pathname)
 return 0

Mercurial > repos > miller-lab > genome_diversity

comparison make_phylip.py @ 35:ea52b23f1141