comparison get_contigs_from_re_archive.py @ 1:814cba36e435 draft

Uploaded
author mvdbeek
date Mon, 21 Feb 2022 10:21:39 +0000
parents ea6a3059a6af
children
comparison
equal deleted inserted replaced
0:ea6a3059a6af 1:814cba36e435
6 import argparse 6 import argparse
7 import re 7 import re
8 import zipfile 8 import zipfile
9 import tempfile 9 import tempfile
10 import textwrap 10 import textwrap
11 import os
11 12
12 def parse_args(): 13 def parse_args():
13 '''Argument parsin''' 14 '''Argument parsin'''
14 description = """ 15 description = """
15 parsing cap3 assembly aln output 16 parsing cap3 assembly aln output
141 142
142 def read_tarean_fasta(fobj): 143 def read_tarean_fasta(fobj):
143 ids = [] 144 ids = []
144 s = [] 145 s = []
145 for i in fobj: 146 for i in fobj:
146 ii = i.decode('utf-8') 147 if isinstance(i, str):
148 ii = i
149 else:
150 ii = i.decode('utf-8')
147 if ii[0] == ">": 151 if ii[0] == ">":
148 ids.append(ii) 152 ids.append(ii)
149 s.append("") 153 s.append("")
150 else: 154 else:
151 s[-1] = s[-1] + ii.strip() 155 s[-1] = s[-1] + ii.strip()
152 return ids, s 156 return ids, s
157
158
159 def extract_contigs_from_re_directory(re_dir, aln_output):
160 with open(aln_output, 'w') as fout:
161 for subdir, dirs, files in os.walk(re_dir):
162 for fn in files:
163 fn_full = subdir + os.sep + fn
164 if re.match('^.+seqclust.+[.]aln$', fn_full):
165 print(fn_full)
166 with open(fn_full) as aln:
167 for l in aln:
168 fout.write(l)
169 return aln_output
170
171
153 172
154 def extract_tarean_contigs_from_re_archive(archive): 173 def extract_tarean_contigs_from_re_archive(archive):
155 with zipfile.ZipFile(archive, 'r') as zip_object: 174 with zipfile.ZipFile(archive, 'r') as zip_object:
156 flist = zip_object.infolist() 175 flist = zip_object.infolist()
157 seqs_all = [] 176 seqs_all = []
166 seqs_all += seqs 185 seqs_all += seqs
167 ids_all += ids 186 ids_all += ids
168 return ids_all, seqs_all 187 return ids_all, seqs_all
169 188
170 189
171 def extract_contigs_from_re_directory(dir, aln_output): 190 def extract_tarean_contigs_from_re_directory(re_dir):
172 # TODO 191 seqs_all = []
173 pass 192 ids_all = []
174 193 for subdir, dirs, files in os.walk(re_dir):
194 for fn in files:
195 fn_full = subdir + os.sep + fn
196 if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full):
197 print (fn_full)
198 with open(fn_full) as fobj:
199 ids, seqs = read_tarean_fasta(fobj)
200 # wrap sequences
201 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs]
202 seqs_all += seqs
203 ids_all += ids
204 return ids_all, seqs_all
175 205
176 def filter_contigs(consensus, coverage, min_coverage=5): 206 def filter_contigs(consensus, coverage, min_coverage=5):
177 x = "".join([ 207 x = "".join([
178 s if cov >= min_coverage else " " 208 s if cov >= min_coverage else " "
179 for s, cov in zip(consensus, coverage) 209 for s, cov in zip(consensus, coverage)
182 return consensus_N 212 return consensus_N
183 213
184 214
185 def main(): 215 def main():
186 args = parse_args() 216 args = parse_args()
187 # extract aln from archive 217 if os.path.isdir(args.re_file):
188 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) 218 # extract from directory
189 aln_file = extract_contigs_from_re_archive( 219 ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file)
190 args.re_file, 220 aln_file = extract_contigs_from_re_directory(
191 tempfile.NamedTemporaryFile().name) 221 args.re_file,
222 tempfile.NamedTemporaryFile().name)
223 else:
224 # extract aln from archive
225 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file)
226 aln_file = extract_contigs_from_re_archive(
227 args.re_file,
228 tempfile.NamedTemporaryFile().name)
229
192 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta: 230 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta:
193 while True: 231 while True:
194 contig_name, seq_start = get_header(f1) 232 contig_name, seq_start = get_header(f1)
195 if contig_name: 233 if contig_name:
196 consensus, coverage = remove_gaps(*read_contig(f1, seq_start)) 234 consensus, coverage = remove_gaps(*read_contig(f1, seq_start))