annotate BSseeker2/bs_index/wg_build.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 from bs_utils.utils import *
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
def wg_build(fasta_file, build_command, ref_path, aligner):
    """Build the bisulfite-converted whole-genome reference for one aligner.

    For every record yielded by ``read_fasta(fasta_file)`` the sequence is
    serialized under its chromosome id, its length is recorded, and four
    converted copies are written as FASTA:

    * ``W_C2T.fa`` / ``W_G2A.fa`` -- the sequence as read, with C->T and
      G->A replacements respectively;
    * ``C_C2T.fa`` / ``C_G2A.fa`` -- the result of ``reverse_compl_seq`` on
      the sequence, with the same two replacements.

    The index build command is then run once per converted genome via
    ``run_in_parallel``, and the intermediate ``.fa`` files are deleted
    unless ``aligner`` is ``"rmap"``.

    Parameters:
        fasta_file: path of the input FASTA file.
        build_command: %-style template containing a ``%(fname)s``
            placeholder; it is formatted with the path prefix of each
            converted genome (without the ``.fa`` extension).
        ref_path: directory under which the reference is created. The
            working directory is ``<ref_path>/<fasta basename>_<aligner>``.
        aligner: aligner name; used as the directory suffix and to decide
            whether the converted FASTA files are kept.

    Returns:
        None.

    NOTE(review): ``replace("C", "T")`` / ``replace("G", "A")`` only touch
    upper-case bases -- presumably ``read_fasta`` upper-cases sequences;
    confirm against bs_utils.utils.
    """
    # ref_path is the directory where the reference genomes are stored, with
    # the input FASTA filename and the aligner name appended.
    ref_path = os.path.join(ref_path,
                            os.path.split(fasta_file)[1] + '_' + aligner)

    clear_dir(ref_path)
    #---------------------------------------------------------------
    # 1. First get the complementary genome (also do the reverse)
    # 2. Then do CT and GA conversions
    #---------------------------------------------------------------

    open_log(os.path.join(ref_path, 'log'))

    # Path prefixes (inside ref_path, no extension) of the converted genomes.
    # This must be a real list, not a lazy map object: it is iterated once to
    # build the commands and once more to delete the FASTA files.
    to_bowtie = [os.path.join(ref_path, name)
                 for name in ['W_C2T', 'W_G2A', 'C_C2T', 'C_G2A']]

    refd = {}
    opened = []
    try:
        # Track each handle as soon as it is opened so that the finally
        # clause closes whatever exists, even if a later open() fails.
        w_c2t = open(os.path.join(ref_path, 'W_C2T.fa'), 'w')
        opened.append(w_c2t)
        c_c2t = open(os.path.join(ref_path, 'C_C2T.fa'), 'w')
        opened.append(c_c2t)
        w_g2a = open(os.path.join(ref_path, 'W_G2A.fa'), 'w')
        opened.append(w_g2a)
        c_g2a = open(os.path.join(ref_path, 'C_G2A.fa'), 'w')
        opened.append(c_g2a)

        for chrom_id, chrom_seq in read_fasta(fasta_file):
            serialize(chrom_seq, os.path.join(ref_path, chrom_id))
            refd[chrom_id] = len(chrom_seq)

            w_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C", "T")))
            w_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G", "A")))

            chrom_seq = reverse_compl_seq(chrom_seq)

            c_c2t.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("C", "T")))
            c_g2a.write('>%s\n%s\n' % (chrom_id, chrom_seq.replace("G", "A")))

            elapsed('Preprocessing ' + chrom_id)
    finally:
        # Close (and thereby flush) the outputs before the index builder
        # reads them, and also when preprocessing fails part-way through.
        for outf in opened:
            outf.close()

    # Mapping of chromosome id -> sequence length.
    serialize(refd, os.path.join(ref_path, "refname"))
    elapsed('Genome preprocessing')

    # start bowtie-build for all converted genomes and wait for the processes to finish
    run_in_parallel([(build_command % {'fname': fname}, fname + '.log')
                     for fname in to_bowtie])

    # delete fasta files of converted genomes
    if aligner != "rmap":
        delete_files(f + '.fa' for f in to_bowtie)

    elapsed('Done')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55