# HG changeset patch
# User cpt
# Date 1654850983 0
# Node ID 06d8e28d0bd7d77fb32cf6d99d9da50aaadb97eb
Uploaded
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/cpt-macros.xml Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/macros.xml Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,58 @@
+
+
+
+
+ progressivemauve
+ python
+ biopython
+ cpt_gffparser
+
+
+
+ 2.4.0
+
+ 10.1371/journal.pone.0011147
+
+
+ 10.1093/bioinformatics/btm039
+
+
+
+ "$xmfa"
+
+
+
+
+
+
+ "$sequences"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ "$gff3_data"
+
+
+ genomeref.fa
+
+
+ ln -s $genome_fasta genomeref.fa;
+
+
+ genomeref.fa
+
+
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/test-data/test.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/test-data/test.fa Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,6 @@
+>A [Orig=NODE_1_length_60_cov_149.246_ID_1]
+GGGGGGGGGGAACAGATAGATTTTTAGATAGACAGATAGAAATGAATGAATGAATGAATG
+>B [Orig=NODE_1_length_38_cov_140.805_ID_1]
+GGGGGGGGGGGGGGGGGGGGAATGAATGAAAATGAATG
+>C [Orig=NODE_1_length_25_cov_144.849_ID_1]
+TTTTTGGGGGGAAGGCCCCCTTGGT
\ No newline at end of file
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/test-data/test.xmfa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/test-data/test.xmfa Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,28 @@
+#FormatVersion Mauve1
+#Sequence1File test.fa
+#Sequence1Entry 1
+#Sequence1Format FastA
+#Sequence2File test.fa
+#Sequence2Entry 2
+#Sequence2Format FastA
+#Sequence3File test.fa
+#Sequence3Entry 3
+#Sequence3Format FastA
+#BackboneFile test.xmfa.bbcols
+> 1:1-10 + test.fa
+-----GGGGGGGGGG
+> 2:1-10 + test.fa
+-----GGGGGGGGGG
+> 3:6-15 + test.fa
+-----GGGGGGAAGG
+=
+> 1:41-60 + test.fa
+AATGAATGAATGAATGAATG
+> 2:21-38 + test.fa
+AATGAATGAA--AATGAATG
+=
+> 1:21-25 + test.fa
+TTTTT
+> 3:21-25 + test.fa
+TTGGT
+=
\ No newline at end of file
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/test-data/xmfa2tbl_out.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/test-data/xmfa2tbl_out.tsv Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,4 @@
+ A B C
+A 100.00 73.68 44.00
+B 46.67 100.00 32.00
+C 18.33 21.05 100.00
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/test-data/xmfa2tbl_out_dice.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/test-data/xmfa2tbl_out_dice.tsv Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,4 @@
+ A B C
+A 100.00 57.14 25.88
+B 57.14 100.00 25.40
+C 25.88 25.40 100.00
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/xmfa.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/xmfa.py Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,144 @@
+# This file is licensed separately from the rest of the codebase. This is due to
+# BioPython's failure to merge https://github.com/biopython/biopython/pull/544
+# in a timely fashion. Please use this file however you see fit!
+#
+#
+# Copyright (c) 2015-2017 Center for Phage Technology. All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY CENTER FOR PHAGE TECHNOLOGY "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL CENTER FOR PHAGE TECHNOLOGY OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+from Bio import SeqIO
+import tempfile
+import sys
+
+
def parse_xmfa(xmfa):
    """Parse an XMFA alignment into LCBs (lists of per-sequence dicts).

    Simple XMFA parser until https://github.com/biopython/biopython/pull/544
    is merged.

    Yields one list per LCB; each entry is a dict with keys:
    rid, id, start, end, strand (1 or -1), file, seq, comment.

    Fix: an unterminated final LCB (file not ending in "=") is now yielded
    instead of being silently dropped; "="-terminated files are unaffected.
    """
    current_lcb = []
    current_seq = {}
    for line in xmfa.readlines():
        # Metadata lines (e.g. "#FormatVersion Mauve1") carry no alignment data.
        if line.startswith("#"):
            continue

        if line.strip() == "=":
            # "=" closes the current LCB; flush any in-progress sequence first.
            if "id" in current_seq:
                current_lcb.append(current_seq)
                current_seq = {}
            yield current_lcb
            current_lcb = []
        else:
            line = line.strip()
            if line.startswith(">"):
                # Header line, e.g.:
                #   > 1:5986-6406 + CbK.fa # CbK_gp011
                if "id" in current_seq:
                    current_lcb.append(current_seq)
                    current_seq = {}
                data = line.split()
                # "seq_id" rather than "id" to avoid shadowing the builtin.
                seq_id, loc = data[1].split(":")
                start, end = loc.split("-")
                current_seq = {
                    "rid": "_".join(data[1:]),
                    "id": seq_id,
                    "start": int(start),
                    "end": int(end),
                    "strand": 1 if data[2] == "+" else -1,
                    "file": data[3],
                    "seq": "",
                    "comment": "",
                }
                # Everything after the "#" marker is a free-text comment.
                if len(data) > 5:
                    current_seq["comment"] = " ".join(data[5:])
            else:
                # Sequence continuation line; accumulate.
                current_seq["seq"] += line

    # Robustness: emit a trailing LCB even when the final "=" is missing.
    if "id" in current_seq:
        current_lcb.append(current_seq)
    if current_lcb:
        yield current_lcb
+
+
# Template for an XMFA block header line, e.g. "> 1:5986-6406 + CbK.fa # CbK_gp011".
HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n"
+
+
def split_by_n(seq, n):
    """Yield successive chunks of *seq*, each at most *n* items long."""
    for offset in range(0, len(seq), n):
        yield seq[offset:offset + n]
+
+
def to_xmfa(lcbs, handle=sys.stdout):
    """Serialize LCBs (as produced by parse_xmfa) back to XMFA format.

    Writes a Mauve format-version header, then for every LCB each
    alignment's header line and 80-column-wrapped sequence, closing each
    LCB with "=".
    """
    handle.write("#FormatVersion Mauve1\n")
    for lcb in lcbs:
        for aln in lcb:
            sign = "+" if aln["strand"] > 0 else "-"
            header = "> {id}:{start}-{end} {strand} {file} # {comment}\n".format(
                id=aln["id"],
                start=aln["start"],
                end=aln["end"],
                strand=sign,
                file=aln["file"],
                comment=aln["comment"],
            )
            handle.write(header)
            residues = aln["seq"]
            # Wrap the alignment at 80 characters per line.
            for offset in range(0, len(residues), 80):
                handle.write(residues[offset:offset + 80] + "\n")
        handle.write("=\n")
+
+
def percent_identity(a, b):
    """Return the % identity of aligned strings *a* and *b*.

    Positions where *a* (the host sequence) has a gap ("-") are excluded
    from the comparison entirely; returns 0.0 when nothing is comparable.
    """
    compared = 0
    matched = 0
    for host_char, other_char in zip(a, b):
        if host_char == "-":
            continue
        compared += 1
        if host_char == other_char:
            matched += 1

    if compared == 0:
        return 0.0
    return 100.0 * matched / compared
+
+
def id_tn_dict(sequences, tmpfile=False):
    """Map XMFA sequence indices ("1", "2", ...) to FASTA record info.

    sequences: a single FASTA path/handle or a list of them; records from
    all files are numbered consecutively in encounter order, matching the
    numeric IDs used inside an XMFA file.
    Returns {index: {"record_id": ..., "len": ...}} when tmpfile is False.

    NOTE(review): when tmpfile=True each entry is *replaced* by a
    NamedTemporaryFile(delete=False), discarding record_id/len — presumably
    callers in that mode only want a per-sequence scratch file; confirm,
    and note the temp files are never deleted here.
    """
    label_convert = {}
    # First record id seen; assigned below but never read or returned in
    # this function — appears to be vestigial. TODO confirm before removal.
    correct_chrom = None
    if not isinstance(sequences, list):
        sequences = [sequences]

    i = 0
    for sequence_file in sequences:
        for record in SeqIO.parse(sequence_file, "fasta"):
            if correct_chrom is None:
                correct_chrom = record.id

            # Keys are 1-based string indices to match XMFA numbering.
            i += 1
            key = str(i)
            label_convert[key] = {"record_id": record.id, "len": len(record.seq)}

            if tmpfile:
                label_convert[key] = tempfile.NamedTemporaryFile(delete=False)

    return label_convert
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/xmfa2tbl.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/xmfa2tbl.py Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+from Bio import SeqIO
+import sys
+from xmfa import parse_xmfa, percent_identity
+import argparse
+import logging
+import itertools
+from collections import Counter
+
# Module-level logging: configure the root logger once, then use a named
# logger so messages are attributable to this module.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
+
+
def _id_tn_dict(sequences):
    """Figure out sequence IDs AND sequence lengths from FASTA file(s).

    Returns {"1": {"id": ..., "len": ...}, ...} keyed by the 1-based,
    string-typed XMFA sequence index. Duplicate FASTA IDs are disambiguated
    by prefixing them with their XMFA index.
    """
    label_convert = {}
    if sequences is not None:
        if len(sequences) == 1:
            # Single multi-record FASTA: record order defines XMFA indices.
            for i, record in enumerate(SeqIO.parse(sequences[0], "fasta")):
                label_convert[str(i + 1)] = {"id": record.id, "len": len(record)}
        else:
            # One file per XMFA sequence.
            # NOTE(review): if a file contains several records, each iteration
            # overwrites label_convert[str(i + 1)], so the *last* record wins;
            # the no-op `continue` below suggests `break` (take the first
            # record) may have been intended — confirm against callers.
            for i, sequence in enumerate(sequences):
                for record in SeqIO.parse(sequence, "fasta"):
                    label_convert[str(i + 1)] = {"id": record.id, "len": len(record)}
                    continue
        # check for repeated sequence IDs
        id_dupes = Counter(list(x["id"] for x in label_convert.values()))
        for dupe in id_dupes:
            if id_dupes[dupe] > 1:
                log.debug("Duplicate FASTA ID Found: %s\n" % (dupe))
                for xmfaid in label_convert.keys():
                    # Change the duplicate labels to have the XMFA identifier
                    # as a prefix so as not to break the table generation later
                    if label_convert[xmfaid]["id"] == dupe:
                        label_convert[xmfaid]["id"] = "%s_%s" % (xmfaid, dupe)

    return label_convert
+
+
def total_similarity(xmfa_file, sequences=None, dice=False):
    """Print a pairwise percent-similarity matrix for an XMFA alignment.

    xmfa_file: open handle to the XMFA alignment.
    sequences: FASTA file(s) used to recover real sequence IDs and lengths.
    dice: if True, normalize with the Dice coefficient (symmetric);
    otherwise divide each row's sum by that sequence's own length
    (asymmetric matrix).
    Writes a TSV table to stdout; raises Exception when no sequences given.
    """
    if sequences is None:
        raise Exception("Must provide a non-zero number of sequence files")

    label_convert = _id_tn_dict(sequences)
    lcbs = parse_xmfa(xmfa_file)

    # Accumulator keyed by (row FASTA id, column FASTA id).
    table = {}

    for lcb in lcbs:
        # Skip empty LCBs. (Single-sequence LCBs pass this check but produce
        # no 2-element permutations below, so they contribute nothing.)
        if len(lcb) == 0:
            continue

        # All ordered pairs of sequences present in the current LCB.
        compare_seqs = list(itertools.permutations(range(0, len(lcb)), 2))
        for permutation in compare_seqs:
            (i, j) = permutation
            similarity = percent_identity(lcb[i]["seq"], lcb[j]["seq"])

            i_name = label_convert[lcb[i]["id"]]["id"]
            j_name = label_convert[lcb[j]["id"]]["id"]
            # Length of sequence i's slice in this LCB (coordinates are
            # 1-based and inclusive, hence the "- 1").
            length_seq_lcb = lcb[i]["end"] - (lcb[i]["start"] - 1)
            # Weight the similarity by LCB length so longer alignments
            # dominate the final figure.
            if (i_name, j_name) not in table:
                table[(i_name, j_name)] = 0
            table[(i_name, j_name)] += length_seq_lcb * similarity

    # Finalize: convert weighted sums into percentages.
    for i in label_convert.keys():
        for j in label_convert.keys():
            i_name = label_convert[i]["id"]
            j_name = label_convert[j]["id"]
            if (i_name, j_name) in table:
                if dice:
                    # Dice: 2 * shared / (len_i + len_j) — symmetric.
                    table[(i_name, j_name)] = (
                        2
                        * table[(i_name, j_name)]
                        / (label_convert[i]["len"] + label_convert[j]["len"])
                    )
                else:
                    # Normalize by sequence i's length only — asymmetric.
                    table[(i_name, j_name)] = (
                        table[(i_name, j_name)] / label_convert[i]["len"]
                    )
            else:
                # Pair never co-occurred in any LCB.
                table[(i_name, j_name)] = 0

            # Self-comparison is 100% by definition.
            if i_name == j_name:
                table[(i_name, j_name)] = 100

    # Emit the TSV table. NOTE(review): keys sort lexicographically, so with
    # 10+ sequences "10" orders before "2" — confirm intended ordering.
    names = []
    table_keys = sorted(label_convert.keys())

    for i in table_keys:
        names.append(label_convert[i]["id"])

    sys.stdout.write("\t" + "\t".join(names) + "\n")
    for j in table_keys:
        j_key = label_convert[j]["id"]
        sys.stdout.write(j_key)
        for i in table_keys:
            i_key = label_convert[i]["id"]
            sys.stdout.write("\t%0.2f" % table[(i_key, j_key)])
        sys.stdout.write("\n")
+
+
if __name__ == "__main__":
    # Fixed copy/paste from the gff3 converter: this tool produces a
    # pairwise similarity table, so name it accordingly.
    parser = argparse.ArgumentParser(
        description="Convert XMFA alignments to a similarity table", prog="xmfa2tbl"
    )
    parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")
    parser.add_argument(
        "sequences",
        type=argparse.FileType("r"),
        nargs="+",
        help="Fasta files (in same order) passed to parent for reconstructing proper IDs",
    )
    parser.add_argument(
        "--dice",
        action="store_true",
        # "%%" because argparse %-interpolates help text; a bare "%" makes
        # `--help` raise instead of printing usage.
        help="Use dice method for calculating %% identity",
    )
    parser.add_argument("--version", action="version", version="%(prog)s 1.0")

    args = parser.parse_args()

    total_similarity(**vars(args))
diff -r 000000000000 -r 06d8e28d0bd7 cpt_convert_xmfa/xmfa2tbl.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/xmfa2tbl.xml Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,53 @@
+
+
+
+
+ macros.xml
+ cpt-macros.xml
+
+
+ $output
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+