Mercurial > repos > iuc > progressivemauve
comparison xmfa2gff3.py @ 0:74093fb62bdf draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author | iuc |
---|---|
date | Wed, 17 Aug 2016 14:46:55 -0400 |
parents | |
children | bca52822843e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:74093fb62bdf |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 from Bio import SeqIO | |
4 from Bio.Seq import Seq | |
5 from Bio.SeqRecord import SeqRecord | |
6 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
7 import argparse | |
8 from BCBio import GFF | |
9 import logging | |
10 logging.basicConfig(level=logging.INFO) | |
11 log = logging.getLogger(__name__) | |
12 | |
13 | |
14 def parse_xmfa(xmfa): | |
15 """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 | |
16 """ | |
17 current_lcb = [] | |
18 current_seq = {} | |
19 for line in xmfa.readlines(): | |
20 if line.startswith('#'): | |
21 continue | |
22 | |
23 if line.strip() == '=': | |
24 if 'id' in current_seq: | |
25 current_lcb.append(current_seq) | |
26 current_seq = {} | |
27 yield current_lcb | |
28 current_lcb = [] | |
29 else: | |
30 line = line.strip() | |
31 if line.startswith('>'): | |
32 if 'id' in current_seq: | |
33 current_lcb.append(current_seq) | |
34 current_seq = {} | |
35 data = line.strip().split() | |
36 id, loc = data[1].split(':') | |
37 start, end = loc.split('-') | |
38 current_seq = { | |
39 'rid': '_'.join(data[1:]), | |
40 'id': id, | |
41 'start': int(start), | |
42 'end': int(end), | |
43 'strand': 1 if data[2] == '+' else -1, | |
44 'seq': '' | |
45 } | |
46 else: | |
47 current_seq['seq'] += line.strip() | |
48 | |
49 | |
50 def _percent_identity(a, b): | |
51 """Calculate % identity, ignoring gaps in the host sequence | |
52 """ | |
53 match = 0 | |
54 mismatch = 0 | |
55 for char_a, char_b in zip(list(a), list(b)): | |
56 if char_a == '-': | |
57 continue | |
58 if char_a == char_b: | |
59 match += 1 | |
60 else: | |
61 mismatch += 1 | |
62 | |
63 if match + mismatch == 0: | |
64 return 0 | |
65 return 100 * float(match) / (match + mismatch) | |
66 | |
67 | |
68 def _id_tn_dict(sequences): | |
69 """Figure out sequence IDs | |
70 """ | |
71 label_convert = {} | |
72 if sequences is not None: | |
73 if len(sequences) == 1: | |
74 for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')): | |
75 label_convert[str(i + 1)] = record.id | |
76 else: | |
77 for i, sequence in enumerate(sequences): | |
78 for record in SeqIO.parse(sequence, 'fasta'): | |
79 label_convert[str(i + 1)] = record.id | |
80 continue | |
81 return label_convert | |
82 | |
83 | |
84 def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000): | |
85 label_convert = _id_tn_dict(sequences) | |
86 | |
87 lcbs = parse_xmfa(xmfa_file) | |
88 | |
89 records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))] | |
90 for lcb in lcbs: | |
91 ids = [seq['id'] for seq in lcb] | |
92 | |
93 # Doesn't match part of our sequence | |
94 if relative_to not in ids: | |
95 continue | |
96 | |
97 # Skip sequences that are JUST our "relative_to" genome | |
98 if len(ids) == 1: | |
99 continue | |
100 | |
101 parent = [seq for seq in lcb if seq['id'] == relative_to][0] | |
102 others = [seq for seq in lcb if seq['id'] != relative_to] | |
103 | |
104 for other in others: | |
105 other['feature'] = SeqFeature( | |
106 FeatureLocation(parent['start'], parent['end'] + 1), | |
107 type="match", strand=parent['strand'], | |
108 qualifiers={ | |
109 "source": "progressiveMauve", | |
110 "target": label_convert.get(other['id'], other['id']), | |
111 "ID": label_convert.get(other['id'], 'xmfa_' + other['rid']) | |
112 } | |
113 ) | |
114 | |
115 for i in range(0, len(lcb[0]['seq']), window_size): | |
116 block_seq = parent['seq'][i:i + window_size] | |
117 real_window_size = len(block_seq) | |
118 real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i | |
119 real_end = real_start + real_window_size - block_seq.count('-') | |
120 | |
121 if (real_end - real_start) < 10: | |
122 continue | |
123 | |
124 if parent['start'] < 0: | |
125 strand = -1 | |
126 else: | |
127 strand = 1 | |
128 | |
129 for other in others: | |
130 pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size]) | |
131 # Ignore 0% identity sequences | |
132 if pid == 0: | |
133 continue | |
134 other['feature'].sub_features.append( | |
135 SeqFeature( | |
136 FeatureLocation(real_start, real_end), | |
137 type="match_part", strand=strand, | |
138 qualifiers={ | |
139 "source": "progressiveMauve", | |
140 'score': pid | |
141 } | |
142 ) | |
143 ) | |
144 | |
145 for other in others: | |
146 records[0].features.append(other['feature']) | |
147 return records | |
148 | |
149 | |
150 if __name__ == '__main__': | |
151 parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3') | |
152 parser.add_argument('xmfa_file', type=file, help='XMFA File') | |
153 parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000) | |
154 parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1') | |
155 parser.add_argument('--sequences', type=file, nargs='+', | |
156 help='Fasta files (in same order) passed to parent for reconstructing proper IDs') | |
157 parser.add_argument('--version', action='version', version='%(prog)s 1.0') | |
158 | |
159 args = parser.parse_args() | |
160 | |
161 result = convert_xmfa_to_gff3(**vars(args)) | |
162 GFF.write(result, sys.stdout) |