comparison make_phylip.py @ 31:a631c2f6d913

Update to Miller Lab devshed revision 3c4110ffacc3
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:25:27 -0400
parents
children ea52b23f1141
comparison
equal deleted inserted replaced
30:4188853b940b 31:a631c2f6d913
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # mkFastas.py
5 #
6 # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 # MA 02110-1301, USA.
22
23 import argparse
24 import errno
25 import os
26 import shutil
27
28 def mkdir_p(path):
29 try:
30 os.makedirs(path)
31 except OSError, e:
32 if e.errno <> errno.EEXIST:
33 raise
34
35 def revseq(seq):
36 seq=list(seq)
37 seq.reverse()
38 seq=''.join(seq)
39 return seq
40
41 def revComp(allPop):
42 dAllCompAll={'A':'T','T':'A','C':'G','G':'C','N':'N','M':'K','K':'M','R':'Y','Y':'R','W':'W','S':'S'}
43 allPopsComp=dAllCompAll[allPop]
44 return allPopsComp
45
46 def rtrnCons(ntA,ntB):
47 srtdPairs=''.join(sorted([ntA,ntB]))
48 dpairsCons={'AC':'M', 'AG':'R', 'AT':'W', 'CG':'S', 'CT':'Y', 'GT':'K', 'AN':'A', 'CN':'C', 'GN':'G', 'NT':'T'}
49 cons=dpairsCons[srtdPairs]
50 return cons
51
52 def rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB,fulldChrdPosdPopsAlllsInit=False,cvrgTreshold=False,indvlsPrctTrshld=False):
53 """
54 """
55 dChrdPosdPopsAlllsInit={}
56 seqref=[]
57 for eachl in open(inSNPf,'r'):
58 if eachl.strip() and eachl[0]!='#':
59 fllInfoSplt=eachl.splitlines()[0].split('\t')
60 chrx=fllInfoSplt[pxchrx]
61 pos=int(fllInfoSplt[pxpos])
62 ntA=fllInfoSplt[pxntA]
63 ntB=fllInfoSplt[pxntB]
64 seqref.append([pos,ntA])
65 dPopsAllls={}
66 if fulldChrdPosdPopsAlllsInit:
67 #~
68 cntIndv=0
69 #
70 try:
71 fulldPopsAllls=fulldChrdPosdPopsAlllsInit[chrx][pos]
72 except:
73 fulldPopsAllls=dict([(echPop,ntA) for echPop in dPopsinSNPfPos])
74 #
75 for eachPop in dPopsinSNPfPos:
76 clmnCvrg=dPopsinSNPfPos[eachPop]
77 if clmnCvrg:
78 eachPopCvrg=int(fllInfoSplt[clmnCvrg])
79 else:
80 #~ eachPopCvrg=0
81 eachPopCvrg=cvrgTreshold
82 if eachPopCvrg>=cvrgTreshold:
83 dPopsAllls[eachPop]=fulldPopsAllls[eachPop]
84 cntIndv+=1
85 else:
86 dPopsAllls[eachPop]='N'
87 #~
88 if indvlsPrctTrshld>(cntIndv/float(len(dPopsinSNPfPos))):
89 dPopsAllls=dict([(echPop,'N') for echPop in dPopsinSNPfPos])
90 #~
91 else:
92 for eachPop in dPopsinSNPfPos:
93 if dPopsinSNPfPos[eachPop]:
94 eachPopAll=int(fllInfoSplt[dPopsinSNPfPos[eachPop]])
95 if eachPopAll==0:
96 dPopsAllls[eachPop]=ntB
97 elif eachPopAll==2:
98 dPopsAllls[eachPop]=ntA
99 elif eachPopAll==1:
100 dPopsAllls[eachPop]=rtrnCons(ntA,ntB)
101 else:
102 dPopsAllls[eachPop]='N'
103 else:
104 dPopsAllls[eachPop]=ntA
105 try:
106 dChrdPosdPopsAlllsInit[chrx][pos]=dPopsAllls
107 except:
108 dChrdPosdPopsAlllsInit[chrx]={pos:dPopsAllls}
109 #~
110 seqref.sort()
111 startExs=[seqref[0][0]]
112 endExs=[seqref[-1][0]+1]
113 seqref=''.join(x[1] for x in seqref)
114 #~
115 return dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs
116
117
118 def rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn):
119 """
120 """
121 dENSEMBLTchrxStEndEx={}
122 dChrdStrtEndENSEMBLT={}
123 for eachl in open(inUCSCfile,'r'):
124 if eachl.strip():
125 rvrse=False
126 allVls=eachl.split('\t')
127 txStart=allVls[txStartClmn]
128 txEnd=allVls[txEndClmn]
129 ENSEMBLT=allVls[geneNameClmn]
130 strand=allVls[strandClmn]
131 chrx=allVls[fchrClmn]
132 if cdsStartClmn and cdsEndClmn:
133 cdsStart=allVls[cdsStartClmn]
134 cdsEnd=allVls[cdsEndClmn]
135 if startExsClmn and endExsClmn:
136 startExs=allVls[startExsClmn]
137 endExs=allVls[endExsClmn]
138 if strand=='-':
139 rvrse=True
140 try:
141 dChrdStrtEndENSEMBLT[chrx][int(txStart),int(txEnd)]=ENSEMBLT
142 except:
143 try:
144 dChrdStrtEndENSEMBLT[chrx]={(int(txStart),int(txEnd)):ENSEMBLT}
145 except:
146 dChrdStrtEndENSEMBLT={chrx:{(int(txStart),int(txEnd)):ENSEMBLT}}
147 #~
148 if cdsStartClmn and cdsEndClmn and startExsClmn and endExsClmn:
149 startExs,endExs=rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd)
150 else:
151 startExs,endExs=[int(txStart)],[int(txEnd)]
152 dENSEMBLTchrxStEndEx[ENSEMBLT]=(chrx,startExs,endExs,rvrse)
153 #~
154 dENSEMBLTseq={}
155 ENSEMBLTseqs=[(x.splitlines()[0],''.join(x.splitlines()[1:])) for x in open(inCDSfile).read().split('>') if x.strip()]
156 for ENSEMBLT,seq in ENSEMBLTseqs:
157 dENSEMBLTseq[ENSEMBLT]=seq
158 #~
159 dENSEMBLTseqChrStEnEx={}
160 for ENSEMBLT in dENSEMBLTchrxStEndEx:
161 chrx,startExs,endExs,rvrse=dENSEMBLTchrxStEndEx[ENSEMBLT]
162 addEseqChrStEnEx=True
163 try:
164 seq=dENSEMBLTseq[ENSEMBLT]
165 if rvrse:
166 seq=revseq(seq)
167 except:
168 addEseqChrStEnEx=False
169 if addEseqChrStEnEx:
170 dENSEMBLTseqChrStEnEx[ENSEMBLT]=(seq,chrx,startExs,endExs,rvrse)
171 return dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT
172
173
174 def rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit):
175 """
176 """
177 dENSEMBLTChrPosdAlls={}
178 dChrPosdPopsAllls={}
179 todel=set(dChrdPosdPopsAlllsInit.keys()).difference(set(dChrdStrtEndENSEMBLT.keys()))
180 for x in todel:
181 x=dChrdPosdPopsAlllsInit.pop(x)
182 #---
183 while len(dChrdPosdPopsAlllsInit)>0:
184 chrx=dChrdPosdPopsAlllsInit.keys()[0]
185 dStrtEndENSEMBLT=dChrdStrtEndENSEMBLT.pop(chrx)
186 dPosdPopsAllls=dChrdPosdPopsAlllsInit.pop(chrx)
187 #~
188 srtdStrtEndENSEMBLT=sorted(dStrtEndENSEMBLT.keys())
189 srtdPosdPopsAllls=sorted(dPosdPopsAllls.keys())
190 #~
191 pos=srtdPosdPopsAllls.pop(0)
192 strt,end=srtdStrtEndENSEMBLT.pop(0)
193 ENSEMBLT=dStrtEndENSEMBLT[strt,end]
194 dPopsAllls=dPosdPopsAllls[pos]
195 keePloop=True
196 #~
197 while keePloop:
198 if strt<=pos<=end:
199 for tmpstrt,tmpend in [(strt,end)]+srtdStrtEndENSEMBLT:
200 if tmpstrt<=pos<=tmpend:
201 dPopsAllls=dPosdPopsAllls[pos]
202 dChrPosdPopsAllls[chrx,pos]=dPopsAllls
203 try:
204 dENSEMBLTChrPosdAlls[ENSEMBLT][chrx,pos]=dPopsAllls
205 except:
206 dENSEMBLTChrPosdAlls[ENSEMBLT]={(chrx,pos):dPopsAllls}
207 else:
208 continue
209 if len(srtdPosdPopsAllls)>0:
210 pos=srtdPosdPopsAllls.pop(0)
211 dPopsAllls=dPosdPopsAllls[pos]
212 else:
213 keePloop=False
214 #~
215 elif pos<=strt:
216 if len(srtdPosdPopsAllls)>0:
217 pos=srtdPosdPopsAllls.pop(0)
218 dPopsAllls=dPosdPopsAllls[pos]
219 else:
220 keePloop=False
221 else:
222 if len(srtdStrtEndENSEMBLT)>0:
223 strt,end=srtdStrtEndENSEMBLT.pop(0)
224 ENSEMBLT=dStrtEndENSEMBLT[strt,end]
225 else:
226 keePloop=False
227 return dENSEMBLTChrPosdAlls,dChrPosdPopsAllls
228
229 def rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd):
230 """
231 """
232 cdsStart,cdsEnd=int(cdsStart),int(cdsEnd)
233 crrctdstartExs=set([int(x) for x in startExs.split(',') if x.strip()])
234 crrctdendExs=set([int(x) for x in endExs.split(',') if x.strip()])
235 crrctdstartExs.add(cdsStart)
236 crrctdendExs.add(cdsEnd)
237 sStartDel=set()
238 sEndDel=set()
239 #~
240 for echvl in crrctdstartExs:
241 if echvl<cdsStart or echvl>cdsEnd:
242 sStartDel.add(echvl)
243 #~
244 for echvl in crrctdendExs:
245 if echvl<cdsStart or echvl>cdsEnd:
246 sEndDel.add(echvl)
247 #~
248 return sorted(crrctdstartExs.difference(sStartDel)),sorted(crrctdendExs.difference(sEndDel))
249
250 def rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT):
251 """
252 """
253 exnIntrvl=zip(startExs,endExs)
254 CDSinitPos=exnIntrvl[0][0]
255 dexnIntrvlSeq={}
256 for exStart,exEnd in exnIntrvl:
257 lenEx=exEnd-exStart
258 dexnIntrvlSeq[exStart,exEnd]=seq[:lenEx]
259 seq=seq[lenEx:]
260
261 ldexnIntrvlSeq=len(dexnIntrvlSeq)
262 #~
263 dPopsFasta={}
264 #~
265 strePos=set()
266 dStrePosAbsPos={}
267 tmpAcmltdPos=0
268 #~
269 exStart,exEnd=sorted(dexnIntrvlSeq.keys())[0]
270 seq=dexnIntrvlSeq.pop((exStart,exEnd))
271 chrx,pos=sorted(dChrPosdPopsAllls.keys())[0]
272 dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos))
273 tmpdPopsFasta=dict([(x,list(seq)) for x in dPopsAllls])
274 cntExns=0
275 while True:
276 if exStart<=pos<=exEnd-1:
277 relPos=tmpAcmltdPos+pos-exStart
278 strePos.add(relPos)
279 dStrePosAbsPos[relPos]=pos
280 for echPop in tmpdPopsFasta:
281 allPop=dPopsAllls[echPop]
282 if rvrse:
283 allPop=revComp(allPop)
284 tmpdPopsFasta[echPop][pos-exStart]=allPop
285 if len(dChrPosdPopsAllls)>0:
286 chrx,pos=sorted(dChrPosdPopsAllls.keys())[0]
287 dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos))
288 else:
289 pos=endExs[-1]+100#max pos of exns
290 elif pos<exStart:
291 if len(dChrPosdPopsAllls)>0:
292 chrx,pos=sorted(dChrPosdPopsAllls.keys())[0]
293 dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos))
294 else:
295 pos=endExs[-1]+100#max pos of exns
296 elif pos>exEnd-1:# or len(dChrPosdPopsAllls)==0:
297 for echPop in tmpdPopsFasta:
298 try:
299 dPopsFasta[echPop]+=''.join(tmpdPopsFasta[echPop])
300 except:
301 dPopsFasta[echPop]=''.join(tmpdPopsFasta[echPop])
302 cntExns+=1
303 tmpAcmltdPos+=len(seq)
304 if len(dexnIntrvlSeq)>0:
305 exStart,exEnd=sorted(dexnIntrvlSeq.keys())[0]
306 seq=dexnIntrvlSeq.pop((exStart,exEnd))
307 tmpdPopsFasta=dict([(x,list(seq)) for x in dPopsAllls])
308 else:
309 break
310 if ldexnIntrvlSeq!=cntExns:
311 for echPop in tmpdPopsFasta:
312 dPopsFasta[echPop]+=''.join(tmpdPopsFasta[echPop])
313 #~
314 lchrStartexEndpos=[]
315 if rvrse:
316 dPopsFasta=dict([(echPop,revseq(dPopsFasta[echPop])) for echPop in dPopsFasta])#[echPop]+=''.join(tmpdPopsFasta[echPop])
317 for ePos in strePos:
318 lchrStartexEndpos.append('\t'.join([ENSEMBLT,chrx,str(tmpAcmltdPos-ePos-1),str(dStrePosAbsPos[ePos])]))
319 else:
320 for ePos in strePos:
321 lchrStartexEndpos.append('\t'.join([ENSEMBLT,chrx,str(ePos),str(dStrePosAbsPos[ePos])]))
322 #~
323 return dPopsFasta,lchrStartexEndpos
324
325 def rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls):
326 """
327 """
328 dENSEMBLTPopsFasta={}
329 lchrStartexEndposAll=[]
330 #~
331 sENSEMBLTcmmn=set(dENSEMBLTChrPosdAlls.keys()).intersection(set(dENSEMBLTseqChrStEnEx.keys()))#sENSEMBLTcmmn between UCSC and ENSEMBLE
332 #~
333 for ENSEMBLT in sENSEMBLTcmmn:
334 seq,chrx,startExs,endExs,rvrse=dENSEMBLTseqChrStEnEx[ENSEMBLT]
335 dChrPosdPopsAllls=dENSEMBLTChrPosdAlls[ENSEMBLT]
336 if len(startExs)>0 and len(endExs)>0:
337 dPopsFasta,lchrStartexEndpos=rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT)
338 lchrStartexEndposAll.extend(lchrStartexEndpos)
339 if dPopsFasta:#to correct a bug of the input table, in cases in which endExons<startExn (!). See ENSCAFT00000000145 (MC4R) in canFam2 for example.
340 dENSEMBLTPopsFasta[ENSEMBLT]=dPopsFasta
341 return dENSEMBLTPopsFasta,lchrStartexEndposAll
342
343
344
345 def rtrnPhy(dPopsFasta,ENSEMBLT):
346 """
347 """
348 dPopsFormPhy={}
349 for eachPop in dPopsFasta:
350 hader='%s'%eachPop
351 #~ hader='>%s'%eachPop
352 seq=dPopsFasta[eachPop]
353 formtd='\t'.join([hader,seq])
354 #~ formtd='\n'.join([hader,seq])
355 dPopsFormPhy[eachPop]=formtd
356 #~
357 return dPopsFormPhy,len(seq)
358
359 def wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst):
360 """
361 """
362 ENSEMBLTKaKs=[]
363 nonHeader=True
364 cnt=0
365 lENSEMBLT=len(dENSEMBLTPopsFasta)
366 #~
367 for ENSEMBLT in sorted(dENSEMBLTPopsFasta.keys()):
368 cnt+=1
369 dPopsFasta=dENSEMBLTPopsFasta[ENSEMBLT]
370 dPopsFormPhy,lenseq=rtrnPhy(dPopsFasta,ENSEMBLT)
371 #~
372 seqPMLformat=['%s %s'%(len(dPopsFormPhy),lenseq)]#generate new PHYML sequence
373 #~ seqPMLformat=[]#generate new PHYML sequence
374 for namex in sorted(sPopsIntrst):
375 seqPMLformat.append(dPopsFormPhy[namex])
376 #~
377 mkdir_p(outFastaFold)
378 outFastaf=os.path.join(outFastaFold,'%s.phy'%ENSEMBLT)
379 outFastaf=open(outFastaf,'w')
380 outFastaf.write('\n'.join(seqPMLformat))
381 outFastaf.close()
382 #~
383 return 0
384
385 def main():
386 #~
387 #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0
388 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).')
389 parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True)
390 parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True)
391 parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True)
392 parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True)
393 parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True)
394 parser.add_argument('--output',metavar='output',type=str,help='the output',required=True)
395 parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True)
396 parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True)
397 parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True)
398 #~
399 parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False)
400 parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False)
401 parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False)
402 parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False)
403 parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False)
404 parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False)
405 parser.add_argument('--altrClmnCvrg',metavar='int',type=int,help='the column with the derived nucleotide in the input coverage file.',required=False,default=False)
406 parser.add_argument('--indvlsPrctTrshld',metavar='int',type=float,help='the percentage of individual above which nucleotides are included, else "N".',required=False,default=False)
407 #~
408 parser.add_argument('--sequence',metavar='input fasta file',type=str,help='the input file with the sequence whose SNPs are in the input.',required=False,default=False)
409 parser.add_argument('--gene_info',metavar='input interval file',type=str,help='the input interval file with the the information on the genes.',required=False,default=False)
410 parser.add_argument('--fchrClmn',metavar='int',type=int,help='the column with the chromosome in the gene_info file.',required=False,default=False)
411 parser.add_argument('--txStartClmn',metavar='int',type=int,help='the column with the transcript start column in the gene_info file.',required=False,default=False)
412 parser.add_argument('--txEndClmn',metavar='int',type=int,help='the column with the transcript end column in the gene_info file.',required=False,default=False)
413 parser.add_argument('--strandClmn',metavar='int',type=int,help='the column with the strand column in the gene_info file.',required=False,default=False)
414 parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False)
415 parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False)
416 parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False)
417 parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False)
418 parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False)
419
420 args = parser.parse_args()
421
422 inSNPf = args.input
423 outfile = args.output
424 outfile_id = args.output_id
425 outFastaFold = './out'
426 files_dir = args.output_dir
427 gd_indivs = args.gd_indivs
428 pxchrx = args.chrClmn
429 pxpos = args.posClmn
430 pxntA = args.refClmn
431 pxntB = args.altrClmn
432
433
434 inCDSfile = args.sequence
435 inUCSCfile = args.gene_info
436 fchrClmn = args.fchrClmn#chromosome column
437 txStartClmn = args.txStartClmn#transcript start column
438 txEndClmn = args.txEndClmn#transcript end column
439 strandClmn = args.strandClmn#strand column
440 geneNameClmn = args.geneNameClmn#gene name column
441 cdsStartClmn = args.cdsStartClmn#coding sequence start column
442 cdsEndClmn = args.cdsEndClmn#coding sequence end column
443 startExsClmn = args.startExsClmn#exons start column
444 endExsClmn = args.endExsClmn#exons end column
445
446 inputCover = args.inputCover
447 gd_indivs_cover = args.gd_indivs_cover
448 cvrgTreshold = args.cvrgTreshold
449 pxchrxCov = args.chrClmnCvrg
450 pxposCov = args.posClmnCvrg
451 pxntACov = args.refClmnCvrg
452 pxntBCov = args.altrClmnCvrg
453 indvlsPrctTrshld = args.indvlsPrctTrshld
454
455 #print inputCover, gd_indivs_cover, cvrgTreshold
456
457 assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile))
458
459 #~
460 dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()])
461 #~ dPopsinSNPfPos.update({'ref':False})
462 #~
463 sPopsIntrst=set(dPopsinSNPfPos.keys())
464 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...'
465 #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile)
466 #~
467 if inputCover and gd_indivs_cover and cvrgTreshold>=0:
468 dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()])
469 dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()]))
470 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld)
471 rvrse=False
472 dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)}
473 dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}}
474 #~
475 elif inCDSfile and inUCSCfile:
476 dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn)#~ print '2. Getting transcripts and exons information...'
477 #~
478 dENSEMBLTChrPosdAlls,dChrPosdPopsAllls=rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit)#~ print '3. Getting fixed alleles in exons...'
479 #~
480 dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...'
481 #~
482 wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst)
483 #~
484
485
486 ## get a lit of output files
487 files = []
488 for dirpath, dirnames, filenames in os.walk(outFastaFold):
489 for file in filenames:
490 if file.endswith('.phy'):
491 files.append( os.path.join(dirpath, file) )
492 del dirnames[:]
493
494 if len(files) == 0:
495 with open(outfile, 'w') as ofh:
496 print >> ofh, 'No output.'
497 else:
498 ## the first file becomes the output
499 file = files.pop(0)
500 shutil.move(file, outfile)
501
502 ## rename/move the rest of the files
503 for i, file in enumerate(files):
504 new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2)
505 new_pathname = os.path.join(files_dir, new_filename)
506 shutil.move(file, new_pathname)
507
508 return 0
509
510 if __name__ == '__main__':
511 main()