Mercurial > repos > cpt > cpt_xmfa_to_table
changeset 0:06d8e28d0bd7 draft default tip
Uploaded
author | cpt |
---|---|
date | Fri, 10 Jun 2022 08:49:43 +0000 |
parents | |
children | |
files | cpt_convert_xmfa/cpt-macros.xml cpt_convert_xmfa/macros.xml cpt_convert_xmfa/test-data/test.fa cpt_convert_xmfa/test-data/test.xmfa cpt_convert_xmfa/test-data/xmfa2tbl_out.tsv cpt_convert_xmfa/test-data/xmfa2tbl_out_dice.tsv cpt_convert_xmfa/xmfa.py cpt_convert_xmfa/xmfa2tbl.py cpt_convert_xmfa/xmfa2tbl.xml |
diffstat | 9 files changed, 540 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/cpt-macros.xml Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,115 @@ +<?xml version="1.0"?> +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd $__tool_directory__ && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/macros.xml Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,58 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package">progressivemauve</requirement> + <requirement type="package" version="3.8.13">python</requirement> + <requirement type="package" version="1.79">biopython</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">2.4.0</token> + <xml name="citation/progressive_mauve"> + <citation type="doi">10.1371/journal.pone.0011147</citation> + </xml> + <xml name="citation/gepard"> + <citation type="doi">10.1093/bioinformatics/btm039</citation> + </xml> + + <token name="@XMFA_INPUT@"> + "$xmfa" + </token> + <xml name="xmfa_input" + token_formats="xmfa"> + <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA" /> + </xml> + + <token name="@XMFA_FA_INPUT@"> + "$sequences" + </token> + <xml name="xmfa_fa_input"> + <param type="data" format="fasta" name="sequences" label="Sequences in alignment" + help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/> + + </xml> + <xml name="genome_selector"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input" /> + <expand macro="genome_selector" /> + </xml> + <token name="@INPUT_GFF@"> + "$gff3_data" + </token> + <token name="@INPUT_FASTA@"> + genomeref.fa + </token> + <token name="@GENOME_SELECTOR_PRE@"> + ln -s $genome_fasta genomeref.fa; + </token> + <token name="@GENOME_SELECTOR@"> + genomeref.fa + </token> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/test-data/test.fa Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,6 @@ +>A [Orig=NODE_1_length_60_cov_149.246_ID_1] +GGGGGGGGGGAACAGATAGATTTTTAGATAGACAGATAGAAATGAATGAATGAATGAATG +>B [Orig=NODE_1_length_38_cov_140.805_ID_1] +GGGGGGGGGGGGGGGGGGGGAATGAATGAAAATGAATG +>C [Orig=NODE_1_length_25_cov_144.849_ID_1] +TTTTTGGGGGGAAGGCCCCCTTGGT \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/test-data/test.xmfa Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,28 @@ +#FormatVersion Mauve1 +#Sequence1File test.fa +#Sequence1Entry 1 +#Sequence1Format FastA +#Sequence2File test.fa +#Sequence2Entry 2 +#Sequence2Format FastA +#Sequence3File test.fa +#Sequence3Entry 3 +#Sequence3Format FastA +#BackboneFile test.xmfa.bbcols +> 1:1-10 + test.fa +-----GGGGGGGGGG +> 2:1-10 + test.fa +-----GGGGGGGGGG +> 3:6-15 + test.fa +-----GGGGGGAAGG += +> 1:41-60 + test.fa +AATGAATGAATGAATGAATG +> 2:21-38 + test.fa +AATGAATGAA--AATGAATG += +> 1:21-25 + test.fa +TTTTT +> 3:21-25 + test.fa +TTGGT += \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/test-data/xmfa2tbl_out.tsv Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,4 @@ + A B C +A 100.00 73.68 44.00 +B 46.67 100.00 32.00 +C 18.33 21.05 100.00
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/test-data/xmfa2tbl_out_dice.tsv Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,4 @@ + A B C +A 100.00 57.14 25.88 +B 57.14 100.00 25.40 +C 25.88 25.40 100.00
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/xmfa.py Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,144 @@ +# This file is licensed seperately of the rest of the codebase. This is due to +# BioPython's failure to merge https://github.com/biopython/biopython/pull/544 +# in a timely fashion. Please use this file however you see fit! +# +# +# Copyright (c) 2015-2017 Center for Phage Technology. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY CENTER FOR PHAGE TECHNOLOGY "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL CENTER FOR PHAGE TECHNOLOGY OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from Bio import SeqIO +import tempfile +import sys + + +def parse_xmfa(xmfa): + """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 + """ + current_lcb = [] + current_seq = {} + for line in xmfa.readlines(): + if line.startswith("#"): + continue + + if line.strip() == "=": + if "id" in current_seq: + current_lcb.append(current_seq) + current_seq = {} + yield current_lcb + current_lcb = [] + else: + line = line.strip() + if line.startswith(">"): + if "id" in current_seq: + current_lcb.append(current_seq) + current_seq = {} + data = line.strip().split() + # 0 1 2 3 4 5 + # > 1:5986-6406 + CbK.fa # CbK_gp011 + id, loc = data[1].split(":") + start, end = loc.split("-") + current_seq = { + "rid": "_".join(data[1:]), + "id": id, + "start": int(start), + "end": int(end), + "strand": 1 if data[2] == "+" else -1, + "file": data[3], + "seq": "", + "comment": "", + } + if len(data) > 5: + current_seq["comment"] = " ".join(data[5:]) + else: + current_seq["seq"] += line.strip() + + +HEADER_TPL = "> {id}:{start}-{end} {strand} {file} # {comment}\n" + + +def split_by_n(seq, n): + """A generator to divide a sequence into chunks of n units.""" + # http://stackoverflow.com/questions/9475241/split-python-string-every-nth-character + while seq: + yield seq[:n] + seq = seq[n:] + + +def to_xmfa(lcbs, handle=sys.stdout): + handle.write("#FormatVersion Mauve1\n") + for lcb in lcbs: + for aln in lcb: + handle.write( + HEADER_TPL.format( + id=aln["id"], + start=aln["start"], + end=aln["end"], + strand="+" if aln["strand"] > 0 else "-", + file=aln["file"], + comment=aln["comment"], + ) + ) + + for line in split_by_n(aln["seq"], 80): + handle.write(line + "\n") + handle.write("=\n") + + +def percent_identity(a, b): + """Calculate % identity, ignoring gaps in the host sequence + """ + match = 0 + mismatch = 0 + for char_a, char_b in zip(list(a), list(b)): + if char_a == "-": + continue + if char_a == char_b: + match += 1 + else: + mismatch += 1 + + if match + mismatch == 0: + return 0.0 + return 100 * float(match) / (match + mismatch) + + +def id_tn_dict(sequences, tmpfile=False): + """Figure out sequence IDs + """ + label_convert = {} + correct_chrom = None + if not isinstance(sequences, list): + sequences = [sequences] + + i = 0 + for sequence_file in sequences: + for record in SeqIO.parse(sequence_file, "fasta"): + if correct_chrom is None: + correct_chrom = record.id + + i += 1 + key = str(i) + label_convert[key] = {"record_id": record.id, "len": len(record.seq)} + + if tmpfile: + label_convert[key] = tempfile.NamedTemporaryFile(delete=False) + + return label_convert
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/xmfa2tbl.py Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,128 @@ +#!/usr/bin/env python +from Bio import SeqIO +import sys +from xmfa import parse_xmfa, percent_identity +import argparse +import logging +import itertools +from collections import Counter + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) + + +def _id_tn_dict(sequences): + """Figure out sequence IDs AND sequence lengths from fasta file + """ + label_convert = {} + if sequences is not None: + if len(sequences) == 1: + for i, record in enumerate(SeqIO.parse(sequences[0], "fasta")): + label_convert[str(i + 1)] = {"id": record.id, "len": len(record)} + else: + for i, sequence in enumerate(sequences): + for record in SeqIO.parse(sequence, "fasta"): + label_convert[str(i + 1)] = {"id": record.id, "len": len(record)} + continue + # check for repeated sequence IDs + id_dupes = Counter(list(x["id"] for x in label_convert.values())) + for dupe in id_dupes: + if id_dupes[dupe] > 1: + log.debug("Duplicate FASTA ID Found: %s\n" % (dupe)) + for xmfaid in label_convert.keys(): + # Change the duplicate labels to have the XMFA identifer + # as a prefix so not to break the table generation later + if label_convert[xmfaid]["id"] == dupe: + label_convert[xmfaid]["id"] = "%s_%s" % (xmfaid, dupe) + + return label_convert + + +def total_similarity(xmfa_file, sequences=None, dice=False): + if sequences is None: + raise Exception("Must provide a non-zero number of sequence files") + + label_convert = _id_tn_dict(sequences) + lcbs = parse_xmfa(xmfa_file) + + # make a matrix based on number of sequences + table = {} + + for lcb in lcbs: + # ignore LCBs containing only one sequence + if len(lcb) == 0: + continue + + # permutations based on num sequences to compare for current LCB + compare_seqs = list(itertools.permutations(range(0, len(lcb)), 2)) + for permutation in compare_seqs: + (i, j) = permutation + similarity = percent_identity(lcb[i]["seq"], lcb[j]["seq"]) + + i_name = label_convert[lcb[i]["id"]]["id"] + j_name = label_convert[lcb[j]["id"]]["id"] + # find length of sequence in LCB + length_seq_lcb = lcb[i]["end"] - (lcb[i]["start"] - 1) + # populate table with normalized similarity value based on length_seq_lcb + if (i_name, j_name) not in table: + table[(i_name, j_name)] = 0 + table[(i_name, j_name)] += length_seq_lcb * similarity + + # finalize total percent similarity by dividing by length of parent sequence + for i in label_convert.keys(): + for j in label_convert.keys(): + i_name = label_convert[i]["id"] + j_name = label_convert[j]["id"] + if (i_name, j_name) in table: + if dice: + table[(i_name, j_name)] = ( + 2 + * table[(i_name, j_name)] + / (label_convert[i]["len"] + label_convert[j]["len"]) + ) + else: + table[(i_name, j_name)] = ( + table[(i_name, j_name)] / label_convert[i]["len"] + ) + else: + table[(i_name, j_name)] = 0 + + if i_name == j_name: + table[(i_name, j_name)] = 100 + + # print table + names = [] + table_keys = sorted(label_convert.keys()) + + for i in table_keys: + names.append(label_convert[i]["id"]) + + sys.stdout.write("\t" + "\t".join(names) + "\n") + for j in table_keys: + j_key = label_convert[j]["id"] + sys.stdout.write(j_key) + for i in table_keys: + i_key = label_convert[i]["id"] + sys.stdout.write("\t%0.2f" % table[(i_key, j_key)]) + sys.stdout.write("\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert XMFA alignments to gff3", prog="xmfa2gff3" + ) + parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File") + parser.add_argument( + "sequences", + type=argparse.FileType("r"), + nargs="+", + help="Fasta files (in same order) passed to parent for reconstructing proper IDs", + ) + parser.add_argument( + "--dice", action="store_true", help="Use dice method for calculating % identity" + ) + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + + args = parser.parse_args() + + total_similarity(**vars(args))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_convert_xmfa/xmfa2tbl.xml Fri Jun 10 08:49:43 2022 +0000 @@ -0,0 +1,53 @@ +<?xml version="1.0"?> +<tool id="xmfa2tbl" name="Convert XMFA to percent identity table" version="19.1.0.0"> + <description></description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="aggressive"><![CDATA[ +python $__tool_directory__/xmfa2tbl.py +$dice +@XMFA_INPUT@ +@XMFA_FA_INPUT@ + +> $output +]]></command> + <inputs> + <expand macro="xmfa_input" /> + <expand macro="xmfa_fa_input" /> + <param type="boolean" label="use dice method in percent similarity calculation" + truevalue="--dice" falsevalue="" name="dice" help="The dice method alters the total similarity calculation to reflect the length of both sequences. The default for this option is true."/> + </inputs> + <outputs> + <data format="tabular" name="output" /> + </outputs> + <tests> + <test> + <param name="xmfa" value="test.xmfa"/> + <param name="sequences" value="test.fa" /> + <output name="output" file="xmfa2tbl_out.tsv"/> + </test> + <test> + <param name="dice" value="true"/> + <param name="xmfa" value="test.xmfa"/> + <param name="sequences" value="test.fa" /> + <output name="output" file="xmfa2tbl_out_dice.tsv"/> + </test> + </tests> + <help><![CDATA[ +**What it does** + +This tool compares nucleotide sequences within an input XMFA file and outputs a table reflecting +the percent nucleotide identity between every sequence pair. Total similarity is based on +regions of similarity called locally collinear blocks, or LCBs. There is no penalty for gaps. + +**Options** +The dice method uses the following formula to normalize considering both sequences in the pairwise comparison:: + + 2 * number of identical matches / (query sequence length + subject sequence length) + +]]></help> + <expand macro="citations-2020" /> +</tool>