comparison cpt_convert_xmfa/xmfa.py @ 0:06d8e28d0bd7 draft default tip

Uploaded
author cpt
date Fri, 10 Jun 2022 08:49:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:06d8e28d0bd7
1 # This file is licensed seperately of the rest of the codebase. This is due to
2 # BioPython's failure to merge https://github.com/biopython/biopython/pull/544
3 # in a timely fashion. Please use this file however you see fit!
4 #
5 #
6 # Copyright (c) 2015-2017 Center for Phage Technology. All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are met:
9 #
10 # 1. Redistributions of source code must retain the above copyright notice, this
11 # list of conditions and the following disclaimer.
12 #
13 # 2. Redistributions in binary form must reproduce the above copyright notice,
14 # this list of conditions and the following disclaimer in the documentation
15 # and/or other materials provided with the distribution.
16 #
17 # THIS SOFTWARE IS PROVIDED BY CENTER FOR PHAGE TECHNOLOGY "AS IS" AND ANY
18 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 # DISCLAIMED. IN NO EVENT SHALL CENTER FOR PHAGE TECHNOLOGY OR CONTRIBUTORS BE
21 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
26 # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 from Bio import SeqIO
28 import tempfile
29 import sys
30
31
32 def parse_xmfa(xmfa):
33 """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
34 """
35 current_lcb = []
36 current_seq = {}
37 for line in xmfa.readlines():
38 if line.startswith("#"):
39 continue
40
41 if line.strip() == "=":
42 if "id" in current_seq:
43 current_lcb.append(current_seq)
44 current_seq = {}
45 yield current_lcb
46 current_lcb = []
47 else:
48 line = line.strip()
49 if line.startswith(">"):
50 if "id" in current_seq:
51 current_lcb.append(current_seq)
52 current_seq = {}
53 data = line.strip().split()
54 # 0 1 2 3 4 5
55 # > 1:5986-6406 + CbK.fa # CbK_gp011
56 id, loc = data[1].split(":")
57 start, end = loc.split("-")
58 current_seq = {
59 "rid": "_".join(data[1:]),
60 "id": id,
61 "start": int(start),
62 "end": int(end),
63 "strand": 1 if data[2] == "+" else -1,
64 "file": data[3],
65 "seq": "",
66 "comment": "",
67 }
68 if len(data) > 5:
69 current_seq["comment"] = " ".join(data[5:])
70 else:
71 current_seq["seq"] += line.strip()
72
73
74 HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n"
75
76
77 def split_by_n(seq, n):
78 """A generator to divide a sequence into chunks of n units."""
79 # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character
80 while seq:
81 yield seq[:n]
82 seq = seq[n:]
83
84
85 def to_xmfa(lcbs, handle=sys.stdout):
86 handle.write("#FormatVersion Mauve1\n")
87 for lcb in lcbs:
88 for aln in lcb:
89 handle.write(
90 HEADER_TPL.format(
91 id=aln["id"],
92 start=aln["start"],
93 end=aln["end"],
94 strand="+" if aln["strand"] > 0 else "-",
95 file=aln["file"],
96 comment=aln["comment"],
97 )
98 )
99
100 for line in split_by_n(aln["seq"], 80):
101 handle.write(line + "\n")
102 handle.write("=\n")
103
104
105 def percent_identity(a, b):
106 """Calculate % identity, ignoring gaps in the host sequence
107 """
108 match = 0
109 mismatch = 0
110 for char_a, char_b in zip(list(a), list(b)):
111 if char_a == "-":
112 continue
113 if char_a == char_b:
114 match += 1
115 else:
116 mismatch += 1
117
118 if match + mismatch == 0:
119 return 0.0
120 return 100 * float(match) / (match + mismatch)
121
122
123 def id_tn_dict(sequences, tmpfile=False):
124 """Figure out sequence IDs
125 """
126 label_convert = {}
127 correct_chrom = None
128 if not isinstance(sequences, list):
129 sequences = [sequences]
130
131 i = 0
132 for sequence_file in sequences:
133 for record in SeqIO.parse(sequence_file, "fasta"):
134 if correct_chrom is None:
135 correct_chrom = record.id
136
137 i += 1
138 key = str(i)
139 label_convert[key] = {"record_id": record.id, "len": len(record.seq)}
140
141 if tmpfile:
142 label_convert[key] = tempfile.NamedTemporaryFile(delete=False)
143
144 return label_convert