annotate get_contigs_from_re_archive.py @ 2:3f8ae272f4f3 draft

Uploaded
author petr-novak
date Thu, 07 Oct 2021 07:29:59 +0000
parents cf3cea0a3039
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
2 '''
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
3 parse .aln file - output from cap3 program. Output is fasta file and
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
4 profile file
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
5 '''
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
6 import argparse
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
7 import re
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
8 import zipfile
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
9 import tempfile
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
10 import textwrap
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
11
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
12 def parse_args():
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
13 '''Argument parsin'''
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
14 description = """
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
15 parsing cap3 assembly aln output
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
16 """
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
17
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
18 parser = argparse.ArgumentParser(
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
19 description=description, formatter_class=argparse.RawTextHelpFormatter)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
20 parser.add_argument('-re',
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
21 '--re_file',
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
22 default=None,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
23 required=True,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
24 help="RepeatExlorer archive or directory",
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
25 type=str,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
26 action='store')
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
27 parser.add_argument('-f',
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
28 '--fasta',
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
29 default=None,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
30 required=True,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
31 help="fasta output file name",
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
32 type=str,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
33 action='store')
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
34 parser.add_argument('-m',
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
35 '--min_coverage',
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
36 default=5,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
37 required=False,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
38 help="minimum contig coverage",
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
39 type=int,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
40 action="store")
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
41 parser.add_argument('-L',
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
42 '--min_contig_length',
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
43 default=50,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
44 required=False,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
45 help="minimum contig length",
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
46 type=int,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
47 action="store")
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
48 return parser.parse_args()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
49
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
50
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
51 def get_header(f):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
52 aln_header = " . : . : . : . : . : . :"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
53 contig_lead = "******************"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
54 aln_start = -1
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
55 while True:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
56 line = f.readline()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
57 if not line:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
58 return None, None
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
59 if line[0:18] == contig_lead:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
60 line2 = f.readline()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
61 else:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
62 continue
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
63 if aln_header in line2:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
64 aln_start = line2.index(aln_header)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
65 break
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
66 contig_name = line.split()[1] + line.split()[2]
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
67 return contig_name, aln_start
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
68
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
69
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
70 def segment_start(f):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
71 pos = f.tell()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
72 line = f.readline()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
73 # detect next contig or end of file
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
74 if "********" in line or line == "" or "Number of segment pairs = " in line:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
75 segment = False
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
76 else:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
77 segment = True
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
78 f.seek(pos)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
79 return segment
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
80
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
81
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
82 def get_segment(f, seq_start):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
83 if not segment_start(f):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
84 return None, None
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
85 aln = []
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
86 while True:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
87 line = f.readline()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
88 if ". : . :" in line:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
89 continue
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
90 if "__________" in line:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
91 consensus = f.readline().rstrip('\n')[seq_start:]
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
92 f.readline() # empty line
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
93 break
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
94 else:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
95 aln.append(line.rstrip('\n')[seq_start:])
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
96 return aln, consensus
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
97
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
98
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
99 def aln2coverage(aln):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
100 coverage = [0] * len(aln[0])
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
101 for a in aln:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
102 for i, c in enumerate(a):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
103 if c not in " -":
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
104 coverage[i] += 1
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
105 return coverage
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
106
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
107
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
108 def read_contig(f, seq_start):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
109 contig = ""
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
110 coverage = []
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
111 while True:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
112 aln, consensus = get_segment(f, seq_start)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
113 if aln:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
114 contig += consensus
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
115 coverage += aln2coverage(aln)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
116 else:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
117 break
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
118 return contig, coverage
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
119
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
120
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
121 def remove_gaps(consensus, coverage):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
122 if "-" not in consensus:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
123 return consensus, coverage
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
124 new_coverage = [
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
125 cov for cons, cov in zip(consensus, coverage) if cons != "-"
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
126 ]
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
127 new_consensus = consensus.replace("-", "")
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
128 return new_consensus, new_coverage
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
129
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
130
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
131 def extract_contigs_from_re_archive(archive, aln_output):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
132 with zipfile.ZipFile(archive, 'r') as zip_object, open(aln_output,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
133 'w') as fout:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
134 flist = zip_object.infolist()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
135 for fn in flist:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
136 if re.match('seqclust.+[.]aln$', fn.filename):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
137 with zip_object.open(fn.filename) as aln:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
138 for l in aln:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
139 fout.write(l.decode('utf-8'))
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
140 return aln_output
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
141
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
142 def read_tarean_fasta(fobj):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
143 ids = []
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
144 s = []
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
145 for i in fobj:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
146 ii = i.decode('utf-8')
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
147 if ii[0] == ">":
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
148 ids.append(ii)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
149 s.append("")
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
150 else:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
151 s[-1] = s[-1] + ii.strip()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
152 return ids, s
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
153
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
154 def extract_tarean_contigs_from_re_archive(archive):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
155 with zipfile.ZipFile(archive, 'r') as zip_object:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
156 flist = zip_object.infolist()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
157 for fn in flist:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
158 if re.match("seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn.filename):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
159 print(fn.filename)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
160 with zip_object.open(fn.filename) as fobj:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
161 ids, seqs = read_tarean_fasta(fobj)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
162 # wrap sequences
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
163 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs]
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
164 return ids, seqs
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
165
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
166
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
167 def extract_contigs_from_re_directory(dir, aln_output):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
168 # TODO
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
169 pass
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
170
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
171
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
172 def filter_contigs(consensus, coverage, min_coverage=5):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
173 x = "".join([
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
174 s if cov >= min_coverage else " "
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
175 for s, cov in zip(consensus, coverage)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
176 ]).strip()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
177 consensus_N = "\n".join(textwrap.wrap(x.replace(" ", "N"),80))
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
178 return consensus_N
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
179
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
180
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
181 def main():
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
182 args = parse_args()
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
183 # extract aln from archive
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
184 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
185 aln_file = extract_contigs_from_re_archive(
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
186 args.re_file,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
187 tempfile.NamedTemporaryFile().name)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
188 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
189 while True:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
190 contig_name, seq_start = get_header(f1)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
191 if contig_name:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
192 consensus, coverage = remove_gaps(*read_contig(f1, seq_start))
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
193 clean_consensus = filter_contigs(consensus, coverage,
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
194 args.min_coverage)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
195 if len(clean_consensus) >= args.min_contig_length:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
196 ffasta.write(">{}\n".format(contig_name))
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
197 ffasta.write("{}\n".format(clean_consensus))
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
198 else:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
199 break
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
200
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
201 # write tarean sequences:
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
202 for i, s in zip(ids, seqs):
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
203 ffasta.write(i)
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
204 ffasta.write(s + "\n")
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
205
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
206
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
207
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
208 if __name__ == "__main__":
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
209
cf3cea0a3039 Uploaded
petr-novak
parents:
diff changeset
210 main()