Mercurial > repos > cpt > cpt_xmfa_to_table
comparison cpt_convert_xmfa/xmfa.py @ 0:06d8e28d0bd7 draft default tip
Uploaded
author | cpt |
---|---|
date | Fri, 10 Jun 2022 08:49:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:06d8e28d0bd7 |
---|---|
1 # This file is licensed seperately of the rest of the codebase. This is due to | |
2 # BioPython's failure to merge https://github.com/biopython/biopython/pull/544 | |
3 # in a timely fashion. Please use this file however you see fit! | |
4 # | |
5 # | |
6 # Copyright (c) 2015-2017 Center for Phage Technology. All rights reserved. | |
7 # Redistribution and use in source and binary forms, with or without | |
8 # modification, are permitted provided that the following conditions are met: | |
9 # | |
10 # 1. Redistributions of source code must retain the above copyright notice, this | |
11 # list of conditions and the following disclaimer. | |
12 # | |
13 # 2. Redistributions in binary form must reproduce the above copyright notice, | |
14 # this list of conditions and the following disclaimer in the documentation | |
15 # and/or other materials provided with the distribution. | |
16 # | |
17 # THIS SOFTWARE IS PROVIDED BY CENTER FOR PHAGE TECHNOLOGY "AS IS" AND ANY | |
18 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
19 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
20 # DISCLAIMED. IN NO EVENT SHALL CENTER FOR PHAGE TECHNOLOGY OR CONTRIBUTORS BE | |
21 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
22 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |
23 # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
24 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
25 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
26 # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
27 from Bio import SeqIO | |
28 import tempfile | |
29 import sys | |
30 | |
31 | |
32 def parse_xmfa(xmfa): | |
33 """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 | |
34 """ | |
35 current_lcb = [] | |
36 current_seq = {} | |
37 for line in xmfa.readlines(): | |
38 if line.startswith("#"): | |
39 continue | |
40 | |
41 if line.strip() == "=": | |
42 if "id" in current_seq: | |
43 current_lcb.append(current_seq) | |
44 current_seq = {} | |
45 yield current_lcb | |
46 current_lcb = [] | |
47 else: | |
48 line = line.strip() | |
49 if line.startswith(">"): | |
50 if "id" in current_seq: | |
51 current_lcb.append(current_seq) | |
52 current_seq = {} | |
53 data = line.strip().split() | |
54 # 0 1 2 3 4 5 | |
55 # > 1:5986-6406 + CbK.fa # CbK_gp011 | |
56 id, loc = data[1].split(":") | |
57 start, end = loc.split("-") | |
58 current_seq = { | |
59 "rid": "_".join(data[1:]), | |
60 "id": id, | |
61 "start": int(start), | |
62 "end": int(end), | |
63 "strand": 1 if data[2] == "+" else -1, | |
64 "file": data[3], | |
65 "seq": "", | |
66 "comment": "", | |
67 } | |
68 if len(data) > 5: | |
69 current_seq["comment"] = " ".join(data[5:]) | |
70 else: | |
71 current_seq["seq"] += line.strip() | |
72 | |
73 | |
74 HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n" | |
75 | |
76 | |
77 def split_by_n(seq, n): | |
78 """A generator to divide a sequence into chunks of n units.""" | |
79 # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character | |
80 while seq: | |
81 yield seq[:n] | |
82 seq = seq[n:] | |
83 | |
84 | |
85 def to_xmfa(lcbs, handle=sys.stdout): | |
86 handle.write("#FormatVersion Mauve1\n") | |
87 for lcb in lcbs: | |
88 for aln in lcb: | |
89 handle.write( | |
90 HEADER_TPL.format( | |
91 id=aln["id"], | |
92 start=aln["start"], | |
93 end=aln["end"], | |
94 strand="+" if aln["strand"] > 0 else "-", | |
95 file=aln["file"], | |
96 comment=aln["comment"], | |
97 ) | |
98 ) | |
99 | |
100 for line in split_by_n(aln["seq"], 80): | |
101 handle.write(line + "\n") | |
102 handle.write("=\n") | |
103 | |
104 | |
105 def percent_identity(a, b): | |
106 """Calculate % identity, ignoring gaps in the host sequence | |
107 """ | |
108 match = 0 | |
109 mismatch = 0 | |
110 for char_a, char_b in zip(list(a), list(b)): | |
111 if char_a == "-": | |
112 continue | |
113 if char_a == char_b: | |
114 match += 1 | |
115 else: | |
116 mismatch += 1 | |
117 | |
118 if match + mismatch == 0: | |
119 return 0.0 | |
120 return 100 * float(match) / (match + mismatch) | |
121 | |
122 | |
123 def id_tn_dict(sequences, tmpfile=False): | |
124 """Figure out sequence IDs | |
125 """ | |
126 label_convert = {} | |
127 correct_chrom = None | |
128 if not isinstance(sequences, list): | |
129 sequences = [sequences] | |
130 | |
131 i = 0 | |
132 for sequence_file in sequences: | |
133 for record in SeqIO.parse(sequence_file, "fasta"): | |
134 if correct_chrom is None: | |
135 correct_chrom = record.id | |
136 | |
137 i += 1 | |
138 key = str(i) | |
139 label_convert[key] = {"record_id": record.id, "len": len(record.seq)} | |
140 | |
141 if tmpfile: | |
142 label_convert[key] = tempfile.NamedTemporaryFile(delete=False) | |
143 | |
144 return label_convert |