Mercurial > repos > petr-novak > repeat_annotation_pipeline3
comparison get_contigs_from_re_archive.py @ 1:814cba36e435 draft
Uploaded
author | mvdbeek |
---|---|
date | Mon, 21 Feb 2022 10:21:39 +0000 |
parents | ea6a3059a6af |
children |
comparison
equal
deleted
inserted
replaced
0:ea6a3059a6af | 1:814cba36e435 |
---|---|
6 import argparse | 6 import argparse |
7 import re | 7 import re |
8 import zipfile | 8 import zipfile |
9 import tempfile | 9 import tempfile |
10 import textwrap | 10 import textwrap |
11 import os | |
11 | 12 |
12 def parse_args(): | 13 def parse_args(): |
13 '''Argument parsin''' | 14 '''Argument parsin''' |
14 description = """ | 15 description = """ |
15 parsing cap3 assembly aln output | 16 parsing cap3 assembly aln output |
141 | 142 |
142 def read_tarean_fasta(fobj): | 143 def read_tarean_fasta(fobj): |
143 ids = [] | 144 ids = [] |
144 s = [] | 145 s = [] |
145 for i in fobj: | 146 for i in fobj: |
146 ii = i.decode('utf-8') | 147 if isinstance(i, str): |
148 ii = i | |
149 else: | |
150 ii = i.decode('utf-8') | |
147 if ii[0] == ">": | 151 if ii[0] == ">": |
148 ids.append(ii) | 152 ids.append(ii) |
149 s.append("") | 153 s.append("") |
150 else: | 154 else: |
151 s[-1] = s[-1] + ii.strip() | 155 s[-1] = s[-1] + ii.strip() |
152 return ids, s | 156 return ids, s |
157 | |
158 | |
159 def extract_contigs_from_re_directory(re_dir, aln_output): | |
160 with open(aln_output, 'w') as fout: | |
161 for subdir, dirs, files in os.walk(re_dir): | |
162 for fn in files: | |
163 fn_full = subdir + os.sep + fn | |
164 if re.match('^.+seqclust.+[.]aln$', fn_full): | |
165 print(fn_full) | |
166 with open(fn_full) as aln: | |
167 for l in aln: | |
168 fout.write(l) | |
169 return aln_output | |
170 | |
171 | |
153 | 172 |
154 def extract_tarean_contigs_from_re_archive(archive): | 173 def extract_tarean_contigs_from_re_archive(archive): |
155 with zipfile.ZipFile(archive, 'r') as zip_object: | 174 with zipfile.ZipFile(archive, 'r') as zip_object: |
156 flist = zip_object.infolist() | 175 flist = zip_object.infolist() |
157 seqs_all = [] | 176 seqs_all = [] |
166 seqs_all += seqs | 185 seqs_all += seqs |
167 ids_all += ids | 186 ids_all += ids |
168 return ids_all, seqs_all | 187 return ids_all, seqs_all |
169 | 188 |
170 | 189 |
171 def extract_contigs_from_re_directory(dir, aln_output): | 190 def extract_tarean_contigs_from_re_directory(re_dir): |
172 # TODO | 191 seqs_all = [] |
173 pass | 192 ids_all = [] |
174 | 193 for subdir, dirs, files in os.walk(re_dir): |
194 for fn in files: | |
195 fn_full = subdir + os.sep + fn | |
196 if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full): | |
197 print (fn_full) | |
198 with open(fn_full) as fobj: | |
199 ids, seqs = read_tarean_fasta(fobj) | |
200 # wrap sequences | |
201 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs] | |
202 seqs_all += seqs | |
203 ids_all += ids | |
204 return ids_all, seqs_all | |
175 | 205 |
176 def filter_contigs(consensus, coverage, min_coverage=5): | 206 def filter_contigs(consensus, coverage, min_coverage=5): |
177 x = "".join([ | 207 x = "".join([ |
178 s if cov >= min_coverage else " " | 208 s if cov >= min_coverage else " " |
179 for s, cov in zip(consensus, coverage) | 209 for s, cov in zip(consensus, coverage) |
182 return consensus_N | 212 return consensus_N |
183 | 213 |
184 | 214 |
185 def main(): | 215 def main(): |
186 args = parse_args() | 216 args = parse_args() |
187 # extract aln from archive | 217 if os.path.isdir(args.re_file): |
188 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) | 218 # extract from directory |
189 aln_file = extract_contigs_from_re_archive( | 219 ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file) |
190 args.re_file, | 220 aln_file = extract_contigs_from_re_directory( |
191 tempfile.NamedTemporaryFile().name) | 221 args.re_file, |
222 tempfile.NamedTemporaryFile().name) | |
223 else: | |
224 # extract aln from archive | |
225 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) | |
226 aln_file = extract_contigs_from_re_archive( | |
227 args.re_file, | |
228 tempfile.NamedTemporaryFile().name) | |
229 | |
192 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta: | 230 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta: |
193 while True: | 231 while True: |
194 contig_name, seq_start = get_header(f1) | 232 contig_name, seq_start = get_header(f1) |
195 if contig_name: | 233 if contig_name: |
196 consensus, coverage = remove_gaps(*read_contig(f1, seq_start)) | 234 consensus, coverage = remove_gaps(*read_contig(f1, seq_start)) |