Mercurial > repos > cpt > cpt_xmfa_to_table

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/cpt-macros.xml	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,115 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="gff_requirements">
+		<requirements>
+			<requirement type="package" version="2.7">python</requirement>
+			<requirement type="package" version="1.65">biopython</requirement>
+			<requirement type="package" version="2.12.1">requests</requirement>
+			<yield/>
+		</requirements>
+		<version_command>
+		<![CDATA[
+			cd $__tool_directory__ && git rev-parse HEAD
+		]]>
+		</version_command>
+	</xml>
+	<xml name="citation/mijalisrasche">
+		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+		<citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+	</xml>
+	<xml name="citations">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+    	<xml name="citations-crr">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020-AJC-solo">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-clm">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="sl-citations-clm">
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/macros.xml	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,58 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="requirements">
+		<requirements>
+			<requirement type="package">progressivemauve</requirement>
+			<requirement type="package" version="3.8.13">python</requirement>
+			<requirement type="package" version="1.79">biopython</requirement>
+			<requirement type="package" version="1.2.2">cpt_gffparser</requirement>
+			<yield/>
+		</requirements>
+	</xml>
+	<token name="@WRAPPER_VERSION@">2.4.0</token>
+	<xml name="citation/progressive_mauve">
+		<citation type="doi">10.1371/journal.pone.0011147</citation>
+	</xml>
+	<xml name="citation/gepard">
+		<citation type="doi">10.1093/bioinformatics/btm039</citation>
+	</xml>
+
+	<token name="@XMFA_INPUT@">
+		"$xmfa"
+	</token>
+	<xml name="xmfa_input"
+		token_formats="xmfa">
+		<param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA" />
+	</xml>
+
+	<token name="@XMFA_FA_INPUT@">
+		"$sequences"
+	</token>
+	<xml name="xmfa_fa_input">
+		<param type="data" format="fasta" name="sequences" label="Sequences in alignment"
+			help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/>
+
+	</xml>
+	<xml name="genome_selector">
+	    <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+	</xml>
+	<xml name="gff3_input">
+		<param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+	</xml>
+	<xml name="input/gff3+fasta">
+		<expand macro="gff3_input" />
+		<expand macro="genome_selector" />
+	</xml>
+	<token name="@INPUT_GFF@">
+	"$gff3_data"
+	</token>
+	<token name="@INPUT_FASTA@">
+		genomeref.fa
+	</token>
+	<token name="@GENOME_SELECTOR_PRE@">
+		ln -s $genome_fasta genomeref.fa;
+	</token>
+	<token name="@GENOME_SELECTOR@">
+		genomeref.fa
+	</token>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/test-data/test.fa	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,6 @@
+>A  [Orig=NODE_1_length_60_cov_149.246_ID_1]
+GGGGGGGGGGAACAGATAGATTTTTAGATAGACAGATAGAAATGAATGAATGAATGAATG
+>B  [Orig=NODE_1_length_38_cov_140.805_ID_1]
+GGGGGGGGGGGGGGGGGGGGAATGAATGAAAATGAATG
+>C  [Orig=NODE_1_length_25_cov_144.849_ID_1]
+TTTTTGGGGGGAAGGCCCCCTTGGT
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/test-data/test.xmfa	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,28 @@
+#FormatVersion Mauve1
+#Sequence1File	test.fa
+#Sequence1Entry	1
+#Sequence1Format	FastA
+#Sequence2File	test.fa
+#Sequence2Entry	2
+#Sequence2Format	FastA
+#Sequence3File	test.fa
+#Sequence3Entry	3
+#Sequence3Format	FastA
+#BackboneFile	test.xmfa.bbcols
+> 1:1-10 + test.fa
+-----GGGGGGGGGG
+> 2:1-10 + test.fa
+-----GGGGGGGGGG
+> 3:6-15 + test.fa
+-----GGGGGGAAGG
+=
+> 1:41-60 + test.fa
+AATGAATGAATGAATGAATG
+> 2:21-38 + test.fa
+AATGAATGAA--AATGAATG
+=
+> 1:21-25 + test.fa
+TTTTT
+> 3:21-25 + test.fa
+TTGGT
+=
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/test-data/xmfa2tbl_out.tsv	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,4 @@
+	A	B	C
+A	100.00	73.68	44.00
+B	46.67	100.00	32.00
+C	18.33	21.05	100.00
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/test-data/xmfa2tbl_out_dice.tsv	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,4 @@
+	A	B	C
+A	100.00	57.14	25.88
+B	57.14	100.00	25.40
+C	25.88	25.40	100.00
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/xmfa.py	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,144 @@
+# This file is licensed seperately of the rest of the codebase. This is due to
+# BioPython's failure to merge https://github.com/biopython/biopython/pull/544
+# in a timely fashion. Please use this file however you see fit!
+#
+#
+# Copyright (c) 2015-2017 Center for Phage Technology. All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY CENTER FOR PHAGE TECHNOLOGY "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL CENTER FOR PHAGE TECHNOLOGY OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+from Bio import SeqIO
+import tempfile
+import sys
+
+
+def parse_xmfa(xmfa):
+    """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
+    """
+    current_lcb = []
+    current_seq = {}
+    for line in xmfa.readlines():
+        if line.startswith("#"):
+            continue
+
+        if line.strip() == "=":
+            if "id" in current_seq:
+                current_lcb.append(current_seq)
+                current_seq = {}
+            yield current_lcb
+            current_lcb = []
+        else:
+            line = line.strip()
+            if line.startswith(">"):
+                if "id" in current_seq:
+                    current_lcb.append(current_seq)
+                    current_seq = {}
+                data = line.strip().split()
+                # 0 1           2 3      4 5
+                # > 1:5986-6406 + CbK.fa # CbK_gp011
+                id, loc = data[1].split(":")
+                start, end = loc.split("-")
+                current_seq = {
+                    "rid": "_".join(data[1:]),
+                    "id": id,
+                    "start": int(start),
+                    "end": int(end),
+                    "strand": 1 if data[2] == "+" else -1,
+                    "file": data[3],
+                    "seq": "",
+                    "comment": "",
+                }
+                if len(data) > 5:
+                    current_seq["comment"] = " ".join(data[5:])
+            else:
+                current_seq["seq"] += line.strip()
+
+
+HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n"
+
+
+def split_by_n(seq, n):
+    """A generator to divide a sequence into chunks of n units."""
+    # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character
+    while seq:
+        yield seq[:n]
+        seq = seq[n:]
+
+
+def to_xmfa(lcbs, handle=sys.stdout):
+    handle.write("#FormatVersion Mauve1\n")
+    for lcb in lcbs:
+        for aln in lcb:
+            handle.write(
+                HEADER_TPL.format(
+                    id=aln["id"],
+                    start=aln["start"],
+                    end=aln["end"],
+                    strand="+" if aln["strand"] > 0 else "-",
+                    file=aln["file"],
+                    comment=aln["comment"],
+                )
+            )
+
+            for line in split_by_n(aln["seq"], 80):
+                handle.write(line + "\n")
+        handle.write("=\n")
+
+
+def percent_identity(a, b):
+    """Calculate % identity, ignoring gaps in the host sequence
+    """
+    match = 0
+    mismatch = 0
+    for char_a, char_b in zip(list(a), list(b)):
+        if char_a == "-":
+            continue
+        if char_a == char_b:
+            match += 1
+        else:
+            mismatch += 1
+
+    if match + mismatch == 0:
+        return 0.0
+    return 100 * float(match) / (match + mismatch)
+
+
+def id_tn_dict(sequences, tmpfile=False):
+    """Figure out sequence IDs
+    """
+    label_convert = {}
+    correct_chrom = None
+    if not isinstance(sequences, list):
+        sequences = [sequences]
+
+    i = 0
+    for sequence_file in sequences:
+        for record in SeqIO.parse(sequence_file, "fasta"):
+            if correct_chrom is None:
+                correct_chrom = record.id
+
+            i += 1
+            key = str(i)
+            label_convert[key] = {"record_id": record.id, "len": len(record.seq)}
+
+            if tmpfile:
+                label_convert[key] = tempfile.NamedTemporaryFile(delete=False)
+
+    return label_convert
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/xmfa2tbl.py	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+from Bio import SeqIO
+import sys
+from xmfa import parse_xmfa, percent_identity
+import argparse
+import logging
+import itertools
+from collections import Counter
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def _id_tn_dict(sequences):
+    """Figure out sequence IDs AND sequence lengths from fasta file
+    """
+    label_convert = {}
+    if sequences is not None:
+        if len(sequences) == 1:
+            for i, record in enumerate(SeqIO.parse(sequences[0], "fasta")):
+                label_convert[str(i + 1)] = {"id": record.id, "len": len(record)}
+        else:
+            for i, sequence in enumerate(sequences):
+                for record in SeqIO.parse(sequence, "fasta"):
+                    label_convert[str(i + 1)] = {"id": record.id, "len": len(record)}
+                    continue
+    # check for repeated sequence IDs
+    id_dupes = Counter(list(x["id"] for x in label_convert.values()))
+    for dupe in id_dupes:
+        if id_dupes[dupe] > 1:
+            log.debug("Duplicate FASTA ID Found: %s\n" % (dupe))
+            for xmfaid in label_convert.keys():
+                # Change the duplicate labels to have the XMFA identifer
+                # as a prefix so not to break the table generation later
+                if label_convert[xmfaid]["id"] == dupe:
+                    label_convert[xmfaid]["id"] = "%s_%s" % (xmfaid, dupe)
+
+    return label_convert
+
+
+def total_similarity(xmfa_file, sequences=None, dice=False):
+    if sequences is None:
+        raise Exception("Must provide a non-zero number of sequence files")
+
+    label_convert = _id_tn_dict(sequences)
+    lcbs = parse_xmfa(xmfa_file)
+
+    # make a matrix based on number of sequences
+    table = {}
+
+    for lcb in lcbs:
+        # ignore LCBs containing only one sequence
+        if len(lcb) == 0:
+            continue
+
+        # permutations based on num sequences to compare for current LCB
+        compare_seqs = list(itertools.permutations(range(0, len(lcb)), 2))
+        for permutation in compare_seqs:
+            (i, j) = permutation
+            similarity = percent_identity(lcb[i]["seq"], lcb[j]["seq"])
+
+            i_name = label_convert[lcb[i]["id"]]["id"]
+            j_name = label_convert[lcb[j]["id"]]["id"]
+            # find length of sequence in LCB
+            length_seq_lcb = lcb[i]["end"] - (lcb[i]["start"] - 1)
+            # populate table with normalized similarity value based on length_seq_lcb
+            if (i_name, j_name) not in table:
+                table[(i_name, j_name)] = 0
+            table[(i_name, j_name)] += length_seq_lcb * similarity
+
+    # finalize total percent similarity by dividing by length of parent sequence
+    for i in label_convert.keys():
+        for j in label_convert.keys():
+            i_name = label_convert[i]["id"]
+            j_name = label_convert[j]["id"]
+            if (i_name, j_name) in table:
+                if dice:
+                    table[(i_name, j_name)] = (
+                        2
+                        * table[(i_name, j_name)]
+                        / (label_convert[i]["len"] + label_convert[j]["len"])
+                    )
+                else:
+                    table[(i_name, j_name)] = (
+                        table[(i_name, j_name)] / label_convert[i]["len"]
+                    )
+            else:
+                table[(i_name, j_name)] = 0
+
+            if i_name == j_name:
+                table[(i_name, j_name)] = 100
+
+    # print table
+    names = []
+    table_keys = sorted(label_convert.keys())
+
+    for i in table_keys:
+        names.append(label_convert[i]["id"])
+
+    sys.stdout.write("\t" + "\t".join(names) + "\n")
+    for j in table_keys:
+        j_key = label_convert[j]["id"]
+        sys.stdout.write(j_key)
+        for i in table_keys:
+            i_key = label_convert[i]["id"]
+            sys.stdout.write("\t%0.2f" % table[(i_key, j_key)])
+        sys.stdout.write("\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert XMFA alignments to gff3", prog="xmfa2gff3"
+    )
+    parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")
+    parser.add_argument(
+        "sequences",
+        type=argparse.FileType("r"),
+        nargs="+",
+        help="Fasta files (in same order) passed to parent for reconstructing proper IDs",
+    )
+    parser.add_argument(
+        "--dice", action="store_true", help="Use dice method for calculating % identity"
+    )
+    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
+
+    args = parser.parse_args()
+
+    total_similarity(**vars(args))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_convert_xmfa/xmfa2tbl.xml	Fri Jun 10 08:49:43 2022 +0000
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+<tool id="xmfa2tbl" name="Convert XMFA to percent identity table" version="19.1.0.0">
+	<description></description>
+	<macros>
+		<import>macros.xml</import>
+		<import>cpt-macros.xml</import>
+	</macros>
+	<expand macro="requirements"/>
+	<command detect_errors="aggressive"><![CDATA[
+python $__tool_directory__/xmfa2tbl.py
+$dice
+@XMFA_INPUT@
+@XMFA_FA_INPUT@
+
+> $output
+]]></command>
+	<inputs>
+		<expand macro="xmfa_input" />
+		<expand macro="xmfa_fa_input" />
+		<param type="boolean" label="use dice method in percent similarity calculation"
+			truevalue="--dice" falsevalue="" name="dice" help="The dice method alters the total similarity calculation to reflect the length of both sequences. The default for this option is true."/>
+	</inputs>
+	<outputs>
+		<data format="tabular" name="output" />
+	</outputs>
+	<tests>
+		<test>
+			<param name="xmfa" value="test.xmfa"/>
+			<param name="sequences" value="test.fa" />
+			<output name="output" file="xmfa2tbl_out.tsv"/>
+		</test>
+		<test>
+			<param name="dice" value="true"/>
+			<param name="xmfa" value="test.xmfa"/>
+			<param name="sequences" value="test.fa" />
+			<output name="output" file="xmfa2tbl_out_dice.tsv"/>
+		</test>
+	</tests>
+	<help><![CDATA[
+**What it does**
+
+This tool compares nucleotide sequences within an input XMFA file and outputs a table reflecting
+the percent nucleotide identity between every sequence pair. Total similarity is based on
+regions of similarity called locally collinear blocks, or LCBs. There is no penalty for gaps.
+
+**Options**
+The dice method uses the following formula to normalize considering both sequences in the pairwise comparison::
+
+	2 * number of identical matches / (query sequence length + subject sequence length)
+
+]]></help>
+		<expand macro="citations-2020" />
+</tool>