Mercurial > repos > miller-lab > genome_diversity

diff make_phylip.py @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author: Richard Burhans <burhans@bx.psu.edu>
date: Fri, 20 Sep 2013 13:25:27 -0400
children: ea52b23f1141
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_phylip.py	Fri Sep 20 13:25:27 2013 -0400
@@ -0,0 +1,511 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#       mkFastas.py
+#       
+#       Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
+#       
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#       
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#       GNU General Public License for more details.
+#       
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+
+import argparse
+import errno
+import os
+import shutil
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError, e:
+        if e.errno <> errno.EEXIST:
+            raise
+
+def revseq(seq):
+    seq=list(seq)
+    seq.reverse()
+    seq=''.join(seq)
+    return seq
+    
+def revComp(allPop):
+    dAllCompAll={'A':'T','T':'A','C':'G','G':'C','N':'N','M':'K','K':'M','R':'Y','Y':'R','W':'W','S':'S'}
+    allPopsComp=dAllCompAll[allPop]
+    return allPopsComp
+
+def rtrnCons(ntA,ntB):
+    srtdPairs=''.join(sorted([ntA,ntB]))
+    dpairsCons={'AC':'M', 'AG':'R', 'AT':'W', 'CG':'S', 'CT':'Y', 'GT':'K', 'AN':'A', 'CN':'C', 'GN':'G', 'NT':'T'}
+    cons=dpairsCons[srtdPairs]
+    return cons
+
+def rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB,fulldChrdPosdPopsAlllsInit=False,cvrgTreshold=False,indvlsPrctTrshld=False):
+    """
+    """
+    dChrdPosdPopsAlllsInit={}
+    seqref=[]
+    for eachl in open(inSNPf,'r'):
+        if eachl.strip() and eachl[0]!='#':
+            fllInfoSplt=eachl.splitlines()[0].split('\t')
+            chrx=fllInfoSplt[pxchrx]
+            pos=int(fllInfoSplt[pxpos])
+            ntA=fllInfoSplt[pxntA]
+            ntB=fllInfoSplt[pxntB]
+            seqref.append([pos,ntA])
+            dPopsAllls={}
+            if fulldChrdPosdPopsAlllsInit:
+                #~ 
+                cntIndv=0
+                #
+                try:
+                    fulldPopsAllls=fulldChrdPosdPopsAlllsInit[chrx][pos]
+                except:
+                    fulldPopsAllls=dict([(echPop,ntA) for echPop in dPopsinSNPfPos])            
+                #
+                for eachPop in dPopsinSNPfPos:
+                    clmnCvrg=dPopsinSNPfPos[eachPop]
+                    if clmnCvrg:
+                        eachPopCvrg=int(fllInfoSplt[clmnCvrg])
+                    else:
+                        #~ eachPopCvrg=0
+                        eachPopCvrg=cvrgTreshold
+                    if eachPopCvrg>=cvrgTreshold:
+                        dPopsAllls[eachPop]=fulldPopsAllls[eachPop]
+                        cntIndv+=1
+                    else:
+                        dPopsAllls[eachPop]='N'        
+                #~ 
+                if indvlsPrctTrshld>(cntIndv/float(len(dPopsinSNPfPos))):
+                    dPopsAllls=dict([(echPop,'N') for echPop in dPopsinSNPfPos])
+            #~ 
+            else:
+                for eachPop in dPopsinSNPfPos:
+                    if dPopsinSNPfPos[eachPop]:
+                        eachPopAll=int(fllInfoSplt[dPopsinSNPfPos[eachPop]])
+                        if eachPopAll==0:
+                            dPopsAllls[eachPop]=ntB
+                        elif eachPopAll==2:
+                            dPopsAllls[eachPop]=ntA
+                        elif eachPopAll==1:
+                            dPopsAllls[eachPop]=rtrnCons(ntA,ntB)
+                        else:
+                            dPopsAllls[eachPop]='N'
+                    else:
+                        dPopsAllls[eachPop]=ntA
+            try:
+                dChrdPosdPopsAlllsInit[chrx][pos]=dPopsAllls
+            except:
+                dChrdPosdPopsAlllsInit[chrx]={pos:dPopsAllls}
+    #~ 
+    seqref.sort()
+    startExs=[seqref[0][0]]
+    endExs=[seqref[-1][0]+1]
+    seqref=''.join(x[1] for x in seqref)
+    #~ 
+    return dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs
+
+
+def rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn):
+    """
+    """
+    dENSEMBLTchrxStEndEx={}
+    dChrdStrtEndENSEMBLT={}
+    for eachl in open(inUCSCfile,'r'):
+        if eachl.strip():
+            rvrse=False
+            allVls=eachl.split('\t')
+            txStart=allVls[txStartClmn]
+            txEnd=allVls[txEndClmn]
+            ENSEMBLT=allVls[geneNameClmn]
+            strand=allVls[strandClmn]
+            chrx=allVls[fchrClmn]
+            if cdsStartClmn and cdsEndClmn:
+                cdsStart=allVls[cdsStartClmn]
+                cdsEnd=allVls[cdsEndClmn]
+            if startExsClmn and endExsClmn:
+                startExs=allVls[startExsClmn]
+                endExs=allVls[endExsClmn]
+            if strand=='-':
+                rvrse=True
+            try:
+                dChrdStrtEndENSEMBLT[chrx][int(txStart),int(txEnd)]=ENSEMBLT
+            except:
+                try:
+                    dChrdStrtEndENSEMBLT[chrx]={(int(txStart),int(txEnd)):ENSEMBLT}
+                except:
+                    dChrdStrtEndENSEMBLT={chrx:{(int(txStart),int(txEnd)):ENSEMBLT}}
+            #~ 
+            if cdsStartClmn and cdsEndClmn and startExsClmn and endExsClmn:        
+                startExs,endExs=rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd)
+            else:
+                startExs,endExs=[int(txStart)],[int(txEnd)]
+            dENSEMBLTchrxStEndEx[ENSEMBLT]=(chrx,startExs,endExs,rvrse)
+    #~ 
+    dENSEMBLTseq={}
+    ENSEMBLTseqs=[(x.splitlines()[0],''.join(x.splitlines()[1:])) for x in open(inCDSfile).read().split('>') if x.strip()]
+    for ENSEMBLT,seq in ENSEMBLTseqs:
+        dENSEMBLTseq[ENSEMBLT]=seq
+    #~ 
+    dENSEMBLTseqChrStEnEx={}
+    for ENSEMBLT in dENSEMBLTchrxStEndEx:
+        chrx,startExs,endExs,rvrse=dENSEMBLTchrxStEndEx[ENSEMBLT]
+        addEseqChrStEnEx=True
+        try:
+            seq=dENSEMBLTseq[ENSEMBLT]
+            if rvrse:
+                seq=revseq(seq)
+        except:
+            addEseqChrStEnEx=False
+        if addEseqChrStEnEx:
+            dENSEMBLTseqChrStEnEx[ENSEMBLT]=(seq,chrx,startExs,endExs,rvrse)
+    return dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT
+    
+
+def rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit):
+    """
+    """
+    dENSEMBLTChrPosdAlls={}
+    dChrPosdPopsAllls={}
+    todel=set(dChrdPosdPopsAlllsInit.keys()).difference(set(dChrdStrtEndENSEMBLT.keys()))
+    for x in todel:
+        x=dChrdPosdPopsAlllsInit.pop(x)
+    #---
+    while len(dChrdPosdPopsAlllsInit)>0:
+        chrx=dChrdPosdPopsAlllsInit.keys()[0]
+        dStrtEndENSEMBLT=dChrdStrtEndENSEMBLT.pop(chrx)
+        dPosdPopsAllls=dChrdPosdPopsAlllsInit.pop(chrx)
+        #~ 
+        srtdStrtEndENSEMBLT=sorted(dStrtEndENSEMBLT.keys())
+        srtdPosdPopsAllls=sorted(dPosdPopsAllls.keys())
+        #~
+        pos=srtdPosdPopsAllls.pop(0)
+        strt,end=srtdStrtEndENSEMBLT.pop(0)
+        ENSEMBLT=dStrtEndENSEMBLT[strt,end]
+        dPopsAllls=dPosdPopsAllls[pos]
+        keePloop=True
+        #~ 
+        while keePloop:
+            if strt<=pos<=end:
+                for tmpstrt,tmpend in [(strt,end)]+srtdStrtEndENSEMBLT:
+                    if tmpstrt<=pos<=tmpend:
+                        dPopsAllls=dPosdPopsAllls[pos]
+                        dChrPosdPopsAllls[chrx,pos]=dPopsAllls
+                        try:
+                            dENSEMBLTChrPosdAlls[ENSEMBLT][chrx,pos]=dPopsAllls
+                        except:
+                            dENSEMBLTChrPosdAlls[ENSEMBLT]={(chrx,pos):dPopsAllls}
+                    else:
+                        continue
+                if len(srtdPosdPopsAllls)>0:
+                    pos=srtdPosdPopsAllls.pop(0)
+                    dPopsAllls=dPosdPopsAllls[pos]
+                else:
+                    keePloop=False
+            #~ 
+            elif pos<=strt:
+                if len(srtdPosdPopsAllls)>0:
+                    pos=srtdPosdPopsAllls.pop(0)
+                    dPopsAllls=dPosdPopsAllls[pos]
+                else:
+                    keePloop=False
+            else:
+                if len(srtdStrtEndENSEMBLT)>0:
+                    strt,end=srtdStrtEndENSEMBLT.pop(0)
+                    ENSEMBLT=dStrtEndENSEMBLT[strt,end]
+                else:
+                    keePloop=False
+    return dENSEMBLTChrPosdAlls,dChrPosdPopsAllls
+
+def rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd):
+    """
+    """
+    cdsStart,cdsEnd=int(cdsStart),int(cdsEnd)
+    crrctdstartExs=set([int(x) for x in startExs.split(',') if x.strip()])
+    crrctdendExs=set([int(x) for x in endExs.split(',') if x.strip()])
+    crrctdstartExs.add(cdsStart)
+    crrctdendExs.add(cdsEnd)
+    sStartDel=set()
+    sEndDel=set()
+    #~ 
+    for echvl in crrctdstartExs:
+        if echvl<cdsStart or echvl>cdsEnd:
+            sStartDel.add(echvl)
+    #~ 
+    for echvl in crrctdendExs:
+        if echvl<cdsStart or echvl>cdsEnd:
+            sEndDel.add(echvl)
+    #~ 
+    return sorted(crrctdstartExs.difference(sStartDel)),sorted(crrctdendExs.difference(sEndDel))
+
+def rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT):
+    """
+    """
+    exnIntrvl=zip(startExs,endExs)
+    CDSinitPos=exnIntrvl[0][0]
+    dexnIntrvlSeq={}    
+    for exStart,exEnd in exnIntrvl:
+        lenEx=exEnd-exStart
+        dexnIntrvlSeq[exStart,exEnd]=seq[:lenEx]
+        seq=seq[lenEx:]
+        
+    ldexnIntrvlSeq=len(dexnIntrvlSeq)
+    #~ 
+    dPopsFasta={}
+    #~ 
+    strePos=set()
+    dStrePosAbsPos={}
+    tmpAcmltdPos=0
+    #~ 
+    exStart,exEnd=sorted(dexnIntrvlSeq.keys())[0]
+    seq=dexnIntrvlSeq.pop((exStart,exEnd))
+    chrx,pos=sorted(dChrPosdPopsAllls.keys())[0]
+    dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos))
+    tmpdPopsFasta=dict([(x,list(seq)) for x in dPopsAllls])
+    cntExns=0
+    while True:
+        if  exStart<=pos<=exEnd-1:
+            relPos=tmpAcmltdPos+pos-exStart
+            strePos.add(relPos)
+            dStrePosAbsPos[relPos]=pos
+            for echPop in tmpdPopsFasta:
+                allPop=dPopsAllls[echPop]
+                if rvrse:
+                    allPop=revComp(allPop)
+                tmpdPopsFasta[echPop][pos-exStart]=allPop
+            if len(dChrPosdPopsAllls)>0:
+                chrx,pos=sorted(dChrPosdPopsAllls.keys())[0]
+                dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos))
+            else:
+                pos=endExs[-1]+100#max pos of exns
+        elif pos<exStart:
+            if len(dChrPosdPopsAllls)>0:
+                chrx,pos=sorted(dChrPosdPopsAllls.keys())[0]
+                dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos))
+            else:
+                pos=endExs[-1]+100#max pos of exns
+        elif pos>exEnd-1:# or len(dChrPosdPopsAllls)==0:
+            for echPop in tmpdPopsFasta:
+                try:
+                    dPopsFasta[echPop]+=''.join(tmpdPopsFasta[echPop])
+                except:
+                    dPopsFasta[echPop]=''.join(tmpdPopsFasta[echPop])
+            cntExns+=1
+            tmpAcmltdPos+=len(seq)
+            if len(dexnIntrvlSeq)>0:
+                exStart,exEnd=sorted(dexnIntrvlSeq.keys())[0]
+                seq=dexnIntrvlSeq.pop((exStart,exEnd))
+                tmpdPopsFasta=dict([(x,list(seq)) for x in dPopsAllls])
+            else:
+                break
+    if ldexnIntrvlSeq!=cntExns:
+        for echPop in tmpdPopsFasta:
+            dPopsFasta[echPop]+=''.join(tmpdPopsFasta[echPop])
+    #~ 
+    lchrStartexEndpos=[]
+    if rvrse:
+        dPopsFasta=dict([(echPop,revseq(dPopsFasta[echPop])) for echPop in dPopsFasta])#[echPop]+=''.join(tmpdPopsFasta[echPop])
+        for ePos in strePos:
+            lchrStartexEndpos.append('\t'.join([ENSEMBLT,chrx,str(tmpAcmltdPos-ePos-1),str(dStrePosAbsPos[ePos])]))
+    else:
+        for ePos in strePos:
+            lchrStartexEndpos.append('\t'.join([ENSEMBLT,chrx,str(ePos),str(dStrePosAbsPos[ePos])]))
+    #~ 
+    return dPopsFasta,lchrStartexEndpos
+
+def rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls):
+    """
+    """
+    dENSEMBLTPopsFasta={}
+    lchrStartexEndposAll=[]
+    #~ 
+    sENSEMBLTcmmn=set(dENSEMBLTChrPosdAlls.keys()).intersection(set(dENSEMBLTseqChrStEnEx.keys()))#sENSEMBLTcmmn between UCSC and ENSEMBLE
+    #~ 
+    for ENSEMBLT in sENSEMBLTcmmn:
+        seq,chrx,startExs,endExs,rvrse=dENSEMBLTseqChrStEnEx[ENSEMBLT]
+        dChrPosdPopsAllls=dENSEMBLTChrPosdAlls[ENSEMBLT]
+        if len(startExs)>0 and len(endExs)>0:
+            dPopsFasta,lchrStartexEndpos=rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT)
+            lchrStartexEndposAll.extend(lchrStartexEndpos)
+            if dPopsFasta:#to correct a bug of the input table, in cases in which endExons<startExn (!). See ENSCAFT00000000145 (MC4R) in canFam2 for example.
+                dENSEMBLTPopsFasta[ENSEMBLT]=dPopsFasta
+    return dENSEMBLTPopsFasta,lchrStartexEndposAll
+
+
+
+def rtrnPhy(dPopsFasta,ENSEMBLT):
+    """
+    """
+    dPopsFormPhy={}
+    for eachPop in dPopsFasta:
+        hader='%s'%eachPop
+        #~ hader='>%s'%eachPop
+        seq=dPopsFasta[eachPop]
+        formtd='\t'.join([hader,seq])
+        #~ formtd='\n'.join([hader,seq])
+        dPopsFormPhy[eachPop]=formtd
+    #~ 
+    return dPopsFormPhy,len(seq)
+
+def wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst):
+    """
+    """
+    ENSEMBLTKaKs=[]
+    nonHeader=True
+    cnt=0
+    lENSEMBLT=len(dENSEMBLTPopsFasta)
+    #~     
+    for ENSEMBLT in sorted(dENSEMBLTPopsFasta.keys()):
+        cnt+=1
+        dPopsFasta=dENSEMBLTPopsFasta[ENSEMBLT]
+        dPopsFormPhy,lenseq=rtrnPhy(dPopsFasta,ENSEMBLT)
+        #~ 
+        seqPMLformat=['%s %s'%(len(dPopsFormPhy),lenseq)]#generate new PHYML sequence
+        #~ seqPMLformat=[]#generate new PHYML sequence
+        for namex in sorted(sPopsIntrst):
+            seqPMLformat.append(dPopsFormPhy[namex])
+        #~ 
+        mkdir_p(outFastaFold)
+        outFastaf=os.path.join(outFastaFold,'%s.phy'%ENSEMBLT)
+        outFastaf=open(outFastaf,'w')
+        outFastaf.write('\n'.join(seqPMLformat))
+        outFastaf.close()
+        #~ 
+    return 0
+
+def main():
+    #~ 
+    #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0
+    parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).')
+    parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True)    
+    parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True)
+    parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True)
+    parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True)
+    parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True)
+    parser.add_argument('--output',metavar='output',type=str,help='the output',required=True)
+    parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True)
+    parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True)
+    parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True)
+    #~ 
+    parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False)
+    parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False)
+    parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False)
+    parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False)
+    parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False)
+    parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False)
+    parser.add_argument('--altrClmnCvrg',metavar='int',type=int,help='the column with the derived nucleotide in the input coverage file.',required=False,default=False)
+    parser.add_argument('--indvlsPrctTrshld',metavar='int',type=float,help='the percentage of individual above which nucleotides are included, else "N".',required=False,default=False)
+    #~ 
+    parser.add_argument('--sequence',metavar='input fasta file',type=str,help='the input file with the sequence whose SNPs are in the input.',required=False,default=False)
+    parser.add_argument('--gene_info',metavar='input interval file',type=str,help='the input interval file with the the information on the genes.',required=False,default=False)
+    parser.add_argument('--fchrClmn',metavar='int',type=int,help='the column with the chromosome in the gene_info file.',required=False,default=False)
+    parser.add_argument('--txStartClmn',metavar='int',type=int,help='the column with the transcript start column in the gene_info file.',required=False,default=False)
+    parser.add_argument('--txEndClmn',metavar='int',type=int,help='the column with the transcript end column in the gene_info file.',required=False,default=False)
+    parser.add_argument('--strandClmn',metavar='int',type=int,help='the column with the strand column in the gene_info file.',required=False,default=False)
+    parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False)
+    parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False)
+    parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False)
+    parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False)
+    parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False)
+
+    args = parser.parse_args()
+
+    inSNPf = args.input
+    outfile = args.output
+    outfile_id = args.output_id
+    outFastaFold = './out'
+    files_dir = args.output_dir
+    gd_indivs = args.gd_indivs    
+    pxchrx = args.chrClmn
+    pxpos = args.posClmn
+    pxntA = args.refClmn
+    pxntB = args.altrClmn
+    
+    
+    inCDSfile = args.sequence
+    inUCSCfile = args.gene_info
+    fchrClmn = args.fchrClmn#chromosome column
+    txStartClmn = args.txStartClmn#transcript start column
+    txEndClmn = args.txEndClmn#transcript end column
+    strandClmn = args.strandClmn#strand column
+    geneNameClmn = args.geneNameClmn#gene name column
+    cdsStartClmn = args.cdsStartClmn#coding sequence start column
+    cdsEndClmn = args.cdsEndClmn#coding sequence end column
+    startExsClmn = args.startExsClmn#exons start column
+    endExsClmn = args.endExsClmn#exons end column
+    
+    inputCover = args.inputCover
+    gd_indivs_cover = args.gd_indivs_cover
+    cvrgTreshold = args.cvrgTreshold
+    pxchrxCov = args.chrClmnCvrg
+    pxposCov = args.posClmnCvrg
+    pxntACov = args.refClmnCvrg
+    pxntBCov = args.altrClmnCvrg
+    indvlsPrctTrshld = args.indvlsPrctTrshld
+    
+    #print inputCover, gd_indivs_cover, cvrgTreshold
+    
+    assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile))
+    
+    #~ 
+    dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()])
+    #~ dPopsinSNPfPos.update({'ref':False})
+    #~ 
+    sPopsIntrst=set(dPopsinSNPfPos.keys())
+    dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...'
+    #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile)
+    #~
+    if  inputCover and gd_indivs_cover and cvrgTreshold>=0:
+        dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()])
+        dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()]))
+        dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld)
+        rvrse=False
+        dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)}
+        dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}}
+    #~ 
+    elif inCDSfile and inUCSCfile:
+        dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn)#~ print '2. Getting transcripts and exons information...'        
+    #~ 
+    dENSEMBLTChrPosdAlls,dChrPosdPopsAllls=rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit)#~ print '3. Getting fixed alleles in exons...'
+    #~ 
+    dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...'
+    #~ 
+    wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst)
+    #~ 
+
+
+    ## get a lit of output files
+    files = []
+    for dirpath, dirnames, filenames in os.walk(outFastaFold):
+        for file in filenames:
+            if file.endswith('.phy'):
+                files.append( os.path.join(dirpath, file) )
+        del dirnames[:]
+
+    if len(files) == 0:
+        with open(outfile, 'w') as ofh:
+            print >> ofh, 'No output.'
+    else:
+        ## the first file becomes the output
+        file = files.pop(0)
+        shutil.move(file, outfile)
+
+        ## rename/move the rest of the files
+        for i, file in enumerate(files):
+            new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2)
+            new_pathname = os.path.join(files_dir, new_filename)
+            shutil.move(file, new_pathname)
+
+    return 0
+
+if __name__ == '__main__':
+    main()
author	Richard Burhans <burhans@bx.psu.edu>
date	Fri, 20 Sep 2013 13:25:27 -0400
parents
children	ea52b23f1141