0
|
1 # This file is licensed seperately of the rest of the codebase. This is due to
|
|
2 # BioPython's failure to merge https://github.com/biopython/biopython/pull/544
|
|
3 # in a timely fashion. Please use this file however you see fit!
|
|
4 #
|
|
5 #
|
|
6 # Copyright (c) 2015-2017 Center for Phage Technology. All rights reserved.
|
|
7 # Redistribution and use in source and binary forms, with or without
|
|
8 # modification, are permitted provided that the following conditions are met:
|
|
9 #
|
|
10 # 1. Redistributions of source code must retain the above copyright notice, this
|
|
11 # list of conditions and the following disclaimer.
|
|
12 #
|
|
13 # 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
14 # this list of conditions and the following disclaimer in the documentation
|
|
15 # and/or other materials provided with the distribution.
|
|
16 #
|
|
17 # THIS SOFTWARE IS PROVIDED BY CENTER FOR PHAGE TECHNOLOGY "AS IS" AND ANY
|
|
18 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
19 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
20 # DISCLAIMED. IN NO EVENT SHALL CENTER FOR PHAGE TECHNOLOGY OR CONTRIBUTORS BE
|
|
21 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
22 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
23 # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
24 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
25 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
|
26 # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
27 from Bio import SeqIO
|
|
28 import tempfile
|
|
29 import sys
|
|
30
|
|
31
|
|
32 def parse_xmfa(xmfa):
|
|
33 """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
|
|
34 """
|
|
35 current_lcb = []
|
|
36 current_seq = {}
|
|
37 for line in xmfa.readlines():
|
|
38 if line.startswith("#"):
|
|
39 continue
|
|
40
|
|
41 if line.strip() == "=":
|
|
42 if "id" in current_seq:
|
|
43 current_lcb.append(current_seq)
|
|
44 current_seq = {}
|
|
45 yield current_lcb
|
|
46 current_lcb = []
|
|
47 else:
|
|
48 line = line.strip()
|
|
49 if line.startswith(">"):
|
|
50 if "id" in current_seq:
|
|
51 current_lcb.append(current_seq)
|
|
52 current_seq = {}
|
|
53 data = line.strip().split()
|
|
54 # 0 1 2 3 4 5
|
|
55 # > 1:5986-6406 + CbK.fa # CbK_gp011
|
|
56 id, loc = data[1].split(":")
|
|
57 start, end = loc.split("-")
|
|
58 current_seq = {
|
|
59 "rid": "_".join(data[1:]),
|
|
60 "id": id,
|
|
61 "start": int(start),
|
|
62 "end": int(end),
|
|
63 "strand": 1 if data[2] == "+" else -1,
|
|
64 "file": data[3],
|
|
65 "seq": "",
|
|
66 "comment": "",
|
|
67 }
|
|
68 if len(data) > 5:
|
|
69 current_seq["comment"] = " ".join(data[5:])
|
|
70 else:
|
|
71 current_seq["seq"] += line.strip()
|
|
72
|
|
73
|
|
74 HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n"
|
|
75
|
|
76
|
|
77 def split_by_n(seq, n):
|
|
78 """A generator to divide a sequence into chunks of n units."""
|
|
79 # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character
|
|
80 while seq:
|
|
81 yield seq[:n]
|
|
82 seq = seq[n:]
|
|
83
|
|
84
|
|
85 def to_xmfa(lcbs, handle=sys.stdout):
|
|
86 handle.write("#FormatVersion Mauve1\n")
|
|
87 for lcb in lcbs:
|
|
88 for aln in lcb:
|
|
89 handle.write(
|
|
90 HEADER_TPL.format(
|
|
91 id=aln["id"],
|
|
92 start=aln["start"],
|
|
93 end=aln["end"],
|
|
94 strand="+" if aln["strand"] > 0 else "-",
|
|
95 file=aln["file"],
|
|
96 comment=aln["comment"],
|
|
97 )
|
|
98 )
|
|
99
|
|
100 for line in split_by_n(aln["seq"], 80):
|
|
101 handle.write(line + "\n")
|
|
102 handle.write("=\n")
|
|
103
|
|
104
|
|
105 def percent_identity(a, b):
|
|
106 """Calculate % identity, ignoring gaps in the host sequence
|
|
107 """
|
|
108 match = 0
|
|
109 mismatch = 0
|
|
110 for char_a, char_b in zip(list(a), list(b)):
|
|
111 if char_a == "-":
|
|
112 continue
|
|
113 if char_a == char_b:
|
|
114 match += 1
|
|
115 else:
|
|
116 mismatch += 1
|
|
117
|
|
118 if match + mismatch == 0:
|
|
119 return 0.0
|
|
120 return 100 * float(match) / (match + mismatch)
|
|
121
|
|
122
|
|
123 def id_tn_dict(sequences, tmpfile=False):
|
|
124 """Figure out sequence IDs
|
|
125 """
|
|
126 label_convert = {}
|
|
127 correct_chrom = None
|
|
128 if not isinstance(sequences, list):
|
|
129 sequences = [sequences]
|
|
130
|
|
131 i = 0
|
|
132 for sequence_file in sequences:
|
|
133 for record in SeqIO.parse(sequence_file, "fasta"):
|
|
134 if correct_chrom is None:
|
|
135 correct_chrom = record.id
|
|
136
|
|
137 i += 1
|
|
138 key = str(i)
|
|
139 label_convert[key] = {"record_id": record.id, "len": len(record.seq)}
|
|
140
|
|
141 if tmpfile:
|
|
142 label_convert[key] = tempfile.NamedTemporaryFile(delete=False)
|
|
143
|
|
144 return label_convert
|