annotate cpt_convert_xmfa/xmfa2tbl.py @ 0:06d8e28d0bd7 draft default tip

Uploaded
author cpt
date Fri, 10 Jun 2022 08:49:43 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
2 from Bio import SeqIO
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
3 import sys
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
4 from xmfa import parse_xmfa, percent_identity
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
5 import argparse
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
6 import logging
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
7 import itertools
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
8 from collections import Counter
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
9
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
10 logging.basicConfig(level=logging.INFO)
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
11 log = logging.getLogger(__name__)
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
12
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
13
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
14 def _id_tn_dict(sequences):
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
15 """Figure out sequence IDs AND sequence lengths from fasta file
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
16 """
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
17 label_convert = {}
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
18 if sequences is not None:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
19 if len(sequences) == 1:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
20 for i, record in enumerate(SeqIO.parse(sequences[0], "fasta")):
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
21 label_convert[str(i + 1)] = {"id": record.id, "len": len(record)}
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
22 else:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
23 for i, sequence in enumerate(sequences):
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
24 for record in SeqIO.parse(sequence, "fasta"):
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
25 label_convert[str(i + 1)] = {"id": record.id, "len": len(record)}
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
26 continue
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
27 # check for repeated sequence IDs
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
28 id_dupes = Counter(list(x["id"] for x in label_convert.values()))
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
29 for dupe in id_dupes:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
30 if id_dupes[dupe] > 1:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
31 log.debug("Duplicate FASTA ID Found: %s\n" % (dupe))
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
32 for xmfaid in label_convert.keys():
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
33 # Change the duplicate labels to have the XMFA identifer
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
34 # as a prefix so not to break the table generation later
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
35 if label_convert[xmfaid]["id"] == dupe:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
36 label_convert[xmfaid]["id"] = "%s_%s" % (xmfaid, dupe)
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
37
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
38 return label_convert
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
39
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
40
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
41 def total_similarity(xmfa_file, sequences=None, dice=False):
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
42 if sequences is None:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
43 raise Exception("Must provide a non-zero number of sequence files")
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
44
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
45 label_convert = _id_tn_dict(sequences)
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
46 lcbs = parse_xmfa(xmfa_file)
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
47
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
48 # make a matrix based on number of sequences
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
49 table = {}
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
50
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
51 for lcb in lcbs:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
52 # ignore LCBs containing only one sequence
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
53 if len(lcb) == 0:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
54 continue
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
55
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
56 # permutations based on num sequences to compare for current LCB
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
57 compare_seqs = list(itertools.permutations(range(0, len(lcb)), 2))
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
58 for permutation in compare_seqs:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
59 (i, j) = permutation
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
60 similarity = percent_identity(lcb[i]["seq"], lcb[j]["seq"])
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
61
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
62 i_name = label_convert[lcb[i]["id"]]["id"]
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
63 j_name = label_convert[lcb[j]["id"]]["id"]
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
64 # find length of sequence in LCB
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
65 length_seq_lcb = lcb[i]["end"] - (lcb[i]["start"] - 1)
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
66 # populate table with normalized similarity value based on length_seq_lcb
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
67 if (i_name, j_name) not in table:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
68 table[(i_name, j_name)] = 0
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
69 table[(i_name, j_name)] += length_seq_lcb * similarity
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
70
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
71 # finalize total percent similarity by dividing by length of parent sequence
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
72 for i in label_convert.keys():
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
73 for j in label_convert.keys():
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
74 i_name = label_convert[i]["id"]
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
75 j_name = label_convert[j]["id"]
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
76 if (i_name, j_name) in table:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
77 if dice:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
78 table[(i_name, j_name)] = (
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
79 2
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
80 * table[(i_name, j_name)]
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
81 / (label_convert[i]["len"] + label_convert[j]["len"])
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
82 )
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
83 else:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
84 table[(i_name, j_name)] = (
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
85 table[(i_name, j_name)] / label_convert[i]["len"]
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
86 )
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
87 else:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
88 table[(i_name, j_name)] = 0
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
89
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
90 if i_name == j_name:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
91 table[(i_name, j_name)] = 100
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
92
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
93 # print table
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
94 names = []
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
95 table_keys = sorted(label_convert.keys())
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
96
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
97 for i in table_keys:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
98 names.append(label_convert[i]["id"])
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
99
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
100 sys.stdout.write("\t" + "\t".join(names) + "\n")
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
101 for j in table_keys:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
102 j_key = label_convert[j]["id"]
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
103 sys.stdout.write(j_key)
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
104 for i in table_keys:
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
105 i_key = label_convert[i]["id"]
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
106 sys.stdout.write("\t%0.2f" % table[(i_key, j_key)])
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
107 sys.stdout.write("\n")
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
108
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
109
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
110 if __name__ == "__main__":
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
111 parser = argparse.ArgumentParser(
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
112 description="Convert XMFA alignments to gff3", prog="xmfa2gff3"
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
113 )
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
114 parser.add_argument("xmfa_file", type=argparse.FileType("r"), help="XMFA File")
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
115 parser.add_argument(
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
116 "sequences",
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
117 type=argparse.FileType("r"),
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
118 nargs="+",
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
119 help="Fasta files (in same order) passed to parent for reconstructing proper IDs",
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
120 )
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
121 parser.add_argument(
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
122 "--dice", action="store_true", help="Use dice method for calculating % identity"
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
123 )
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
124 parser.add_argument("--version", action="version", version="%(prog)s 1.0")
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
125
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
126 args = parser.parse_args()
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
127
06d8e28d0bd7 Uploaded
cpt
parents:
diff changeset
128 total_similarity(**vars(args))