Mercurial > repos > cpt > cpt_compare_gbk
comparison cpt_gbk_compare/gbk_compare.py @ 0:fc603e665d75 draft default tip
Uploaded
| author | cpt |
|---|---|
| date | Tue, 21 Jun 2022 19:46:32 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:fc603e665d75 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 """ | |
| 3 Copyright 2019 Ryan Wick (rrwick@gmail.com) | |
| 4 https://github.com/rrwick/Compare-annotations/ | |
| 5 This program is free software: you can redistribute it and/or modify it under the terms of the GNU | |
| 6 General Public License as published by the Free Software Foundation, either version 3 of the | |
| 7 License, or (at your option) any later version. | |
| 8 This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without | |
| 9 even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 10 General Public License for more details. | |
| 11 You should have received a copy of the GNU General Public License along with this program. If not, | |
| 12 see <https://www.gnu.org/licenses/>. | |
| 13 """ | |
| 14 | |
| 15 import argparse | |
| 16 from Bio import SeqIO | |
| 17 from Bio import pairwise2 | |
| 18 from Bio.pairwise2 import format_alignment | |
| 19 import itertools | |
| 20 import sys | |
| 21 | |
| 22 | |
| 23 def addArr(arrA, arrB): | |
| 24 res = [] | |
| 25 for x in range(0, min(len(arrA), len(arrB))): | |
| 26 res.append(arrA[x] + arrB[x]) | |
| 27 return res | |
| 28 | |
| 29 def get_arguments(): | |
| 30 parser = argparse.ArgumentParser(description='Compare GenBank annotations', | |
| 31 formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
| 32 | |
| 33 parser.add_argument('annotation_1', type=str, | |
| 34 help='First annotated genome in Genbank format') | |
| 35 parser.add_argument('annotation_2', type=str, | |
| 36 help='Second annotated genome in Genbank format') | |
| 37 | |
| 38 | |
| 39 parser.add_argument('--match_identity_threshold', type=float, default=0.7, | |
| 40 help='Two genes must have at least this identity to be considerd the same (0.0 to 1.0)') | |
| 41 parser.add_argument('--allowed_skipped_genes', type=int, default=10, | |
| 42 help='This many missing genes are allowed when aligning the annotations') | |
| 43 parser.add_argument("--addNotes", action="store_true", help="Add Note fields") | |
| 44 | |
| 45 parser.add_argument( | |
| 46 "-sumOut", type=argparse.FileType("w"), help="Summary out file" | |
| 47 ) | |
| 48 args = parser.parse_args() | |
| 49 return args | |
| 50 | |
| 51 | |
| 52 def main(): | |
| 53 args = get_arguments() | |
| 54 | |
| 55 # Load in the CDS features from the two assemblies. | |
| 56 old = SeqIO.parse(args.annotation_1, 'genbank') | |
| 57 new = SeqIO.parse(args.annotation_2, 'genbank') | |
| 58 old_record = next(old) | |
| 59 new_record = next(new) | |
| 60 old_features, new_features = [], [] | |
| 61 for f in old_record.features: | |
| 62 if f.type == 'CDS': | |
| 63 old_features.append(f) | |
| 64 for f in new_record.features: | |
| 65 if f.type == 'CDS': | |
| 66 new_features.append(f) | |
| 67 | |
| 68 args.sumOut.write('Features in First Genbank\'s assembly:\t' + str(len(old_features)) + "\n") | |
| 69 args.sumOut.write('Features in Second Genbank\'s assembly:\t' + str(len(new_features)) + "\n\n") | |
| 70 | |
| 71 # Align the features to each other. | |
| 72 offsets = sorted(list(itertools.product(range(args.allowed_skipped_genes + 1), | |
| 73 range(args.allowed_skipped_genes + 1))), | |
| 74 key=lambda x: x[0]+x[1]) | |
| 75 old_i, new_i = 0, 0 | |
| 76 exactRec = 0 | |
| 77 inexactRec = [0, 0 ,0] | |
| 78 hypoRec = [0, 0, 0] | |
| 79 newCount = 0 | |
| 80 oldCount = 0 | |
| 81 if args.addNotes: | |
| 82 print("First Record CDS Product\tSimilarity\tSecond Record CDS Product\tPercent Identity\tLength Difference\tFirst Gbk's CDS Location\tSecond Gbk's CDS Location\tHypothetical Status\tFirst Record's Notes\tSecond Record's Notes\n") | |
| 83 else: | |
| 84 print("First Record CDS Product\tSimilarity\tSecond Record CDS Product\tPercent Identity\tLength Difference\tFirst Gbk's CDS Location\tSecond Gbk's CDS Location\tHypothetical Status\n") | |
| 85 while True: | |
| 86 if old_i >= len(old_features) and new_i >= len(new_features): | |
| 87 break | |
| 88 | |
| 89 for old_offset, new_offset in offsets: | |
| 90 try: | |
| 91 old_feature = old_features[old_i + old_offset] | |
| 92 except IndexError: | |
| 93 old_feature = None | |
| 94 try: | |
| 95 new_feature = new_features[new_i + new_offset] | |
| 96 except IndexError: | |
| 97 new_feature = None | |
| 98 try: | |
| 99 match, identity, length_diff = compare_features(old_feature, new_feature, old_record, new_record, args.match_identity_threshold) | |
| 100 except TypeError: | |
| 101 break | |
| 102 if match: | |
| 103 for j in range(old_offset): | |
| 104 print_in_old_not_new(old_features[old_i + j]) | |
| 105 oldCount += 1 | |
| 106 for j in range(new_offset): | |
| 107 print_in_new_not_old(new_features[new_i + j]) | |
| 108 newCount += 1 | |
| 109 if identity == 1.0: | |
| 110 exactRec += 1 | |
| 111 res1, res2 = print_match(old_features[old_i + old_offset], new_features[new_i + new_offset], identity, length_diff, args.addNotes) | |
| 112 inexactRec = addArr(inexactRec, res1) | |
| 113 hypoRec = addArr(hypoRec, res2) | |
| 114 old_i += old_offset | |
| 115 new_i += new_offset | |
| 116 break | |
| 117 else: | |
| 118 sys.stderr.write("Exceeded allowed number of skipped genes (" + str(args.allowed_skipped_genes) + "), unable to maintain alignment and continue comparison.\n") | |
| 119 exit(2) | |
| 120 | |
| 121 if old_feature is None and new_feature is None: | |
| 122 break | |
| 123 | |
| 124 old_i += 1 | |
| 125 new_i += 1 | |
| 126 | |
| 127 args.sumOut.write('Exact Match:\t' + str(exactRec) + "\n\n") | |
| 128 | |
| 129 args.sumOut.write('Inexact Match:\t' + str(inexactRec[0] + inexactRec[1] + inexactRec[2]) + "\n") | |
| 130 args.sumOut.write(' Same length:\t' + str(inexactRec[0]) + "\n") | |
| 131 args.sumOut.write(' Second Gbk Seq longer:\t' + str(inexactRec[2]) + "\n") | |
| 132 args.sumOut.write(' First Gbk Seq longer:\t' + str(inexactRec[1]) + "\n\n") | |
| 133 | |
| 134 args.sumOut.write('In Second Gbk but not in first:\t' + str(newCount) + "\n") | |
| 135 args.sumOut.write('In First Gbk but not in second:\t' + str(oldCount) + "\n\n") | |
| 136 | |
| 137 args.sumOut.write('Hypothetical Annotation Change:\t' + str(hypoRec[1] + hypoRec[2]) + "\n") | |
| 138 args.sumOut.write('Hypothetical:\t' + str(hypoRec[0] + hypoRec[2]) + "\n") | |
| 139 | |
| 140 | |
| 141 def print_match(f1, f2, identity, length_diff, outNotes): | |
| 142 #print('', flush=True) | |
| 143 line = f1.qualifiers['product'][0] + "\t" | |
| 144 matchArr = [0, 0, 0] | |
| 145 hypoArr = [0, 0, 0] | |
| 146 if identity == 1.0: | |
| 147 # print('Exact match') | |
| 148 line += 'Exact match\t' + f2.qualifiers['product'][0] + "\t100.0\tSame Length\t" | |
| 149 else: | |
| 150 # print('Inexact match (' + '%.2f' % (identity * 100.0) + '% ID, ', end='') | |
| 151 line += 'Inexact match\t' + f2.qualifiers['product'][0] + "\t%.2f\t" % (identity * 100.0) | |
| 152 if length_diff == 0: | |
| 153 # print('same length)') | |
| 154 line +="Same Length\t" | |
| 155 matchArr[0] += 1 | |
| 156 elif length_diff > 0: | |
| 157 # print('old seq longer)') | |
| 158 line +="First Gbk Seq Longer\t" | |
| 159 matchArr[1] += 1 | |
| 160 elif length_diff < 0: | |
| 161 # print('new seq longer)') | |
| 162 line +="Second Gbk Seq Longer\t" | |
| 163 matchArr[2] += 1 | |
| 164 # print(' old: ', end='') | |
| 165 # print_feature_one_line(f1) | |
| 166 line += print_feature_one_line(f1) + "\t" | |
| 167 # print(' new: ', end='') | |
| 168 # print_feature_one_line(f2) | |
| 169 line += print_feature_one_line(f2) + "\t" | |
| 170 p1 = f1.qualifiers['product'][0].lower() | |
| 171 p2 = f2.qualifiers['product'][0].lower() | |
| 172 if 'hypothetical' in p1 and 'hypothetical' in p2: | |
| 173 # print(' still hypothetical') | |
| 174 line += "Hypothetical\t" | |
| 175 hypoArr[0] += 1 | |
| 176 elif 'hypothetical' in p1 and 'hypothetical' not in p2: | |
| 177 # print(' no longer hypothetical') | |
| 178 line += "No Longer Hypothetical\t" | |
| 179 hypoArr[1] += 1 | |
| 180 elif 'hypothetical' not in p1 and 'hypothetical' in p2: | |
| 181 # print(' became hypothetical') | |
| 182 line += "Became Hypothetical\t" | |
| 183 hypoArr[2] += 1 | |
| 184 else: | |
| 185 line += "'Hypothetical' not in second nor first Gbk's product tag" | |
| 186 | |
| 187 if outNotes: | |
| 188 line += "\t" | |
| 189 if "note" in f1.qualifiers.keys(): | |
| 190 for x in f1.qualifiers["note"]: | |
| 191 line += x | |
| 192 line += "\t" | |
| 193 else: | |
| 194 line += "N/A\t" | |
| 195 if "note" in f2.qualifiers.keys(): | |
| 196 for x in f2.qualifiers["note"]: | |
| 197 line += x | |
| 198 else: | |
| 199 line += "N/A" | |
| 200 | |
| 201 print(line) | |
| 202 return matchArr, hypoArr | |
| 203 | |
| 204 def print_in_old_not_new(f): # rename file outputs | |
| 205 line = f.qualifiers['product'][0] + "\tIn First Gbk but not Second\tN/A\t0.00\t" + str(f.location.end - f.location.start) + "\t" + print_feature_one_line(f) + "\tN/A\tN/A" | |
| 206 # print('') | |
| 207 # print('In old but not in new:') | |
| 208 # print(' ', end='') | |
| 209 # print_feature_one_line(f) | |
| 210 print(line) | |
| 211 | |
| 212 | |
| 213 def print_in_new_not_old(f): # rename file outputs | |
| 214 line = "N/A\tIn Second Gbk but not First\t" + f.qualifiers['product'][0] + "\t0.00\t" + str(f.location.end - f.location.start) + "\tN/A\t" + print_feature_one_line(f) + "\tN/A" | |
| 215 #print('') | |
| 216 #print('In new but not in old:') | |
| 217 #print(' ', end='') | |
| 218 #print_feature_one_line(f) | |
| 219 print(line) | |
| 220 | |
| 221 | |
| 222 def print_feature_one_line(f): | |
| 223 #f_str = f.qualifiers['product'][0] | |
| 224 f_str = "" | |
| 225 strand = '+' if f.location.strand == 1 else '-' | |
| 226 f_str += '(' + str(f.location.start) + '-' + str(f.location.end) + ' ' + strand + ', ' | |
| 227 f_str += str(f.location.end - f.location.start) + ' bp)' | |
| 228 return(f_str) | |
| 229 | |
| 230 | |
| 231 def compare_features(f1, f2, r1, r2, match_identity_threshold): | |
| 232 if f1 is None or f2 is None: | |
| 233 return False | |
| 234 | |
| 235 s1 = f1.extract(r1).seq | |
| 236 s2 = f2.extract(r2).seq | |
| 237 score = pairwise2.align.globalms(s1, s2, 1, 0, 0, 0, score_only=True) | |
| 238 identity = score / max(len(s1), len(s2)) | |
| 239 match = identity >= match_identity_threshold | |
| 240 length_diff = len(s1) - len(s2) | |
| 241 return match, identity, length_diff | |
| 242 | |
| 243 | |
| 244 if __name__ == '__main__': | |
| 245 main() |
