Mercurial > repos > petr-novak > repeat_annotation_pipeline3
comparison get_contigs_from_re_archive.py @ 1:814cba36e435 draft
Uploaded
| author | mvdbeek |
|---|---|
| date | Mon, 21 Feb 2022 10:21:39 +0000 |
| parents | ea6a3059a6af |
| children |
comparison
equal
deleted
inserted
replaced
| 0:ea6a3059a6af | 1:814cba36e435 |
|---|---|
| 6 import argparse | 6 import argparse |
| 7 import re | 7 import re |
| 8 import zipfile | 8 import zipfile |
| 9 import tempfile | 9 import tempfile |
| 10 import textwrap | 10 import textwrap |
| 11 import os | |
| 11 | 12 |
| 12 def parse_args(): | 13 def parse_args(): |
| 13 '''Argument parsin''' | 14 '''Argument parsin''' |
| 14 description = """ | 15 description = """ |
| 15 parsing cap3 assembly aln output | 16 parsing cap3 assembly aln output |
| 141 | 142 |
| 142 def read_tarean_fasta(fobj): | 143 def read_tarean_fasta(fobj): |
| 143 ids = [] | 144 ids = [] |
| 144 s = [] | 145 s = [] |
| 145 for i in fobj: | 146 for i in fobj: |
| 146 ii = i.decode('utf-8') | 147 if isinstance(i, str): |
| 148 ii = i | |
| 149 else: | |
| 150 ii = i.decode('utf-8') | |
| 147 if ii[0] == ">": | 151 if ii[0] == ">": |
| 148 ids.append(ii) | 152 ids.append(ii) |
| 149 s.append("") | 153 s.append("") |
| 150 else: | 154 else: |
| 151 s[-1] = s[-1] + ii.strip() | 155 s[-1] = s[-1] + ii.strip() |
| 152 return ids, s | 156 return ids, s |
| 157 | |
| 158 | |
| 159 def extract_contigs_from_re_directory(re_dir, aln_output): | |
| 160 with open(aln_output, 'w') as fout: | |
| 161 for subdir, dirs, files in os.walk(re_dir): | |
| 162 for fn in files: | |
| 163 fn_full = subdir + os.sep + fn | |
| 164 if re.match('^.+seqclust.+[.]aln$', fn_full): | |
| 165 print(fn_full) | |
| 166 with open(fn_full) as aln: | |
| 167 for l in aln: | |
| 168 fout.write(l) | |
| 169 return aln_output | |
| 170 | |
| 171 | |
| 153 | 172 |
| 154 def extract_tarean_contigs_from_re_archive(archive): | 173 def extract_tarean_contigs_from_re_archive(archive): |
| 155 with zipfile.ZipFile(archive, 'r') as zip_object: | 174 with zipfile.ZipFile(archive, 'r') as zip_object: |
| 156 flist = zip_object.infolist() | 175 flist = zip_object.infolist() |
| 157 seqs_all = [] | 176 seqs_all = [] |
| 166 seqs_all += seqs | 185 seqs_all += seqs |
| 167 ids_all += ids | 186 ids_all += ids |
| 168 return ids_all, seqs_all | 187 return ids_all, seqs_all |
| 169 | 188 |
| 170 | 189 |
| 171 def extract_contigs_from_re_directory(dir, aln_output): | 190 def extract_tarean_contigs_from_re_directory(re_dir): |
| 172 # TODO | 191 seqs_all = [] |
| 173 pass | 192 ids_all = [] |
| 174 | 193 for subdir, dirs, files in os.walk(re_dir): |
| 194 for fn in files: | |
| 195 fn_full = subdir + os.sep + fn | |
| 196 if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full): | |
| 197 print (fn_full) | |
| 198 with open(fn_full) as fobj: | |
| 199 ids, seqs = read_tarean_fasta(fobj) | |
| 200 # wrap sequences | |
| 201 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs] | |
| 202 seqs_all += seqs | |
| 203 ids_all += ids | |
| 204 return ids_all, seqs_all | |
| 175 | 205 |
| 176 def filter_contigs(consensus, coverage, min_coverage=5): | 206 def filter_contigs(consensus, coverage, min_coverage=5): |
| 177 x = "".join([ | 207 x = "".join([ |
| 178 s if cov >= min_coverage else " " | 208 s if cov >= min_coverage else " " |
| 179 for s, cov in zip(consensus, coverage) | 209 for s, cov in zip(consensus, coverage) |
| 182 return consensus_N | 212 return consensus_N |
| 183 | 213 |
| 184 | 214 |
| 185 def main(): | 215 def main(): |
| 186 args = parse_args() | 216 args = parse_args() |
| 187 # extract aln from archive | 217 if os.path.isdir(args.re_file): |
| 188 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) | 218 # extract from directory |
| 189 aln_file = extract_contigs_from_re_archive( | 219 ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file) |
| 190 args.re_file, | 220 aln_file = extract_contigs_from_re_directory( |
| 191 tempfile.NamedTemporaryFile().name) | 221 args.re_file, |
| 222 tempfile.NamedTemporaryFile().name) | |
| 223 else: | |
| 224 # extract aln from archive | |
| 225 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) | |
| 226 aln_file = extract_contigs_from_re_archive( | |
| 227 args.re_file, | |
| 228 tempfile.NamedTemporaryFile().name) | |
| 229 | |
| 192 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta: | 230 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta: |
| 193 while True: | 231 while True: |
| 194 contig_name, seq_start = get_header(f1) | 232 contig_name, seq_start = get_header(f1) |
| 195 if contig_name: | 233 if contig_name: |
| 196 consensus, coverage = remove_gaps(*read_contig(f1, seq_start)) | 234 consensus, coverage = remove_gaps(*read_contig(f1, seq_start)) |
