Mercurial repository cpt / cpt_search_file: comparison of searchFile.py @ 1:6e3a843b6304 (draft)
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:53:18 +0000 |
| parents | |
| children | |
Compared revisions: 0:bd2ff2c7e806 and 1:6e3a843b6304

##### Search user-supplied input file(s) (BLAST XML, GFF3, and/or GenBank) for user-designated terms (a usage sketch follows this listing)

import argparse
import explodeJSON as ej
import gffutils  # required dependency
from Bio.Blast import NCBIXML
from Bio import SeqIO
import re
import os

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

####### TERM FUNCTIONS
def dbaseTerms(terms, galaxy=True):
    """Index into the JSON term database and retrieve all desired terms"""
    db_path = os.path.join(SCRIPT_DIR, "data/lysis-family-v1.0.3.json")
    db = ej.explodeJSON(db_path)
    db = db.readJSON()
    dbase_terms = []
    if terms:
        for term in terms:
            index_list = term.split(",")
            for t in index_list:
                if t != "None":
                    dbase_terms.extend(db[t])
                else:
                    # skip "None" placeholders without discarding terms already collected
                    continue
        return dbase_terms
    else:
        pass


def userTerms(file, text):
    """Select terms input by the user"""
    user_terms = []
    if file:
        terms = open(file.name).read().splitlines()
        user_terms.extend(terms)
    else:
        pass
    if text:
        if re.search("__cn__", str(text[0])):
            # Galaxy joins multiple text inputs with "__cn__"; split them back apart
            s = text[0]
            split = s.split("__cn__")
            user_terms.extend(split)
        else:
            user_terms.extend(text)
    else:
        pass

    return user_terms


def glueTerms(dbase_terms, user_terms):
    """glue dbaseTerms and userTerms together for eventual query item"""
    glued = []
    if dbase_terms:
        glued.extend(dbase_terms)
    else:
        pass
    if user_terms:
        glued.extend(user_terms)
    else:
        pass

    return glued


####### FILE FUNCTIONS
def glueFiles(gff, gbk, fa, blast):
    """Glue the per-format file arguments into one list of lists: [gffs, gbks, fas, blasts]"""
    files = []
    gffs = []
    gbks = []
    fas = []
    blasts = []
    if gff:
        for gff_file in gff:
            gffs.extend(gff_file)
    else:
        pass
    if gbk:
        for gbk_file in gbk:
            gbks.extend(gbk_file)
    else:
        pass
    if fa:
        for fa_file in fa:
            fas.extend(fa_file)
    else:
        pass
    if blast:
        for blast_file in blast:
            blasts.extend(blast_file)
    else:
        pass
    files = [gffs, gbks, fas, blasts]

    return files


######## PARSE FILE FUNCTIONS
def readGFF3(files, search_list):
    """Search GFF3 file(s) for features matching the search list and return the unique hits"""
    if files:
        for idx, file in enumerate(files):
            if idx == 0:
                print("Parsing - " + file.name)
                db = gffutils.create_db(
                    file.name, dbfn="file.db", force=True, keep_order=False
                )
                db = gffutils.FeatureDB("file.db")
                features = db.all_features()
                gff3_matches = []
                for feature in features:
                    gff3_matches.extend(
                        searchInput(str(feature), search_list=search_list)
                    )
                gff3_matches = list(set(gff3_matches))  # deduplicate matches
            else:
                print("Parsing - " + file.name)
                db = gffutils.create_db(
                    file.name, dbfn=str(idx) + "_file.db", force=True, keep_order=False
                )
                db = gffutils.FeatureDB(str(idx) + "_file.db")
                features = db.all_features()
                for feature in features:
                    gff3_matches.extend(
                        searchInput(str(feature), search_list=search_list)
                    )
                gff3_matches = list(set(gff3_matches))  # deduplicate matches
        gff3_matches.sort()
        return gff3_matches
    else:
        pass


def readGBK(files, search_list):
    """Search GenBank feature qualifiers (product / note / dbxref) for matching terms"""
    if files:
        for idx, file in enumerate(files):
            if idx == 0:
                print("Parsing - " + file.name)
                record = SeqIO.read(file.name, "genbank")
                gbk_matches = []
                for feature in record.features:
                    try:
                        if (
                            searchInput(
                                str(feature.qualifiers["product"]),
                                search_list=search_list,
                            )
                            or searchInput(
                                str(feature.qualifiers["note"]), search_list=search_list
                            )
                            or searchInput(
                                str(feature.qualifiers["dbxref"]),
                                search_list=search_list,
                            )
                        ):
                            gbk_matches.extend([str(feature)])
                        else:
                            continue
                    except KeyError:
                        # features lacking one of the qualifiers above are skipped
                        continue
                gbk_matches = list(set(gbk_matches))
            else:
                print("Parsing - " + file.name)
                record = SeqIO.read(file.name, "genbank")
                for feature in record.features:
                    try:
                        if (
                            searchInput(
                                str(feature.qualifiers["product"]),
                                search_list=search_list,
                            )
                            or searchInput(
                                str(feature.qualifiers["note"]), search_list=search_list
                            )
                            or searchInput(
                                str(feature.qualifiers["dbxref"]),
                                search_list=search_list,
                            )
                        ):
                            gbk_matches.extend([str(feature)])
                        else:
                            continue
                    except KeyError:
                        continue
                gbk_matches = list(set(gbk_matches))
        gbk_matches.sort()
        return gbk_matches
    else:
        pass


def readFASTA(files, search_list):
    """Search FASTA record descriptions for matching terms"""
    if files:
        for idx, file in enumerate(files):
            if idx == 0:
                print("Parsing - " + file.name)
                record = SeqIO.parse(file.name, "fasta")
                fa_matches = []
                for feature in record:
                    fa_matches.extend(
                        searchInput(feature.description, search_list=search_list)
                    )
                fa_matches = list(set(fa_matches))
            else:
                print("Parsing - " + file.name)
                record = SeqIO.parse(file.name, "fasta")
                for feature in record:
                    fa_matches.extend(
                        searchInput(feature.description, search_list=search_list)
                    )
                fa_matches = list(set(fa_matches))
        fa_matches.sort()
        return fa_matches
    else:
        pass


def readBLAST(files, search_list):
    """Search BLAST XML hit descriptions for matching terms, keyed by query ID"""
    if files:
        for idx, file in enumerate(files):
            if idx == 0:
                print("Parsing - " + file.name)
                blast_records = NCBIXML.parse(open(file.name))
                blast_matches = []
                for blast_record in blast_records:
                    for desc in blast_record.descriptions:
                        pretty = prettifyXML(str(desc))
                        for each_ret in pretty:
                            blast_matches.extend(
                                searchInput(
                                    each_ret,
                                    search_list=search_list,
                                    blast=True,
                                    q_id=blast_record.query,
                                )
                            )
                blast_matches = list(set(blast_matches))
            else:
                print("Parsing - " + file.name)
                blast_records = NCBIXML.parse(open(file.name))
                for blast_record in blast_records:
                    for desc in blast_record.descriptions:
                        pretty = prettifyXML(str(desc))
                        for each_ret in pretty:
                            blast_matches.extend(
                                searchInput(
                                    each_ret,
                                    search_list=search_list,
                                    blast=True,
                                    q_id=blast_record.query,
                                )
                            )
                blast_matches = list(set(blast_matches))
        blast_matches.sort()
        return blast_matches
    else:
        pass


######## SEARCH FILE FUNCTIONS
def searchInput(input, search_list, blast=False, q_id=None):
    """Search an input string against each term in search_list and return the unique matches"""
    output = []
    for search_term in search_list:
        if blast:
            if re.search(re.escape(search_term), input):
                add_query = (
                    "QueryID: "
                    + str(q_id)
                    + "\nSearchQuery: "
                    + search_term
                    + "\nMatch: "
                    + input
                    + "\n"
                )
                output.extend([add_query])
            else:
                continue
            # st = r"\b"+search_term+r"\b"  # possible word-boundary variant, currently unused
        else:
            if re.search(re.escape(search_term), input):
                output.extend([input])
            else:
                continue
    return list(set(output))


######## prettify-XML function
def prettifyXML(input):
    """Split a BLAST XML description string on '>' so each hit can be searched separately"""
    s = input
    split = s.split(">")

    return split


########## Output File Writer
def writeResults(gffs, gbks, fas, blasts, outName="termHits.txt"):
    """Takes an input list for each parameter, and writes each result to the output file"""
    # outName is expected to be an open file handle (argparse FileType), hence .name below
    with open(outName.name, "w+") as out_file:
        if gffs:
            out_file.writelines(
                "\n==================== GFF3 Term Hits ====================\n\n"
            )
            for gff_hits in gffs:
                out_file.writelines(gff_hits + "\n")
        else:
            gffs = []
        if gbks:
            out_file.writelines(
                "\n==================== GBK Term Hits ====================\n\n"
            )
            for gbk_hits in gbks:
                out_file.writelines(gbk_hits + "\n")
        else:
            gbks = []
        if fas:
            out_file.writelines(
                "\n==================== FASTA Term Hits ====================\n\n"
            )
            for fa_hits in fas:
                out_file.writelines(fa_hits + "\n")
        else:
            fas = []
        if blasts:
            out_file.writelines(
                "\n==================== BLAST Term Hits ====================\n\n"
            )
            for blast_hits in blasts:
                out_file.writelines(blast_hits + "\n")
        else:
            blasts = []
        if len(gffs) or len(gbks) or len(fas) or len(blasts):
            print("Terms Found")
        else:
            out_file.writelines("No query matches, try again with new terms!")
            print("No query matches, try again with new terms!")


def write_gff3(gffs, outName="proxHits.gff3"):
    """writes output to gff3 file for prox2lysis pipeline"""

    with open(outName.name, "w+") as out_file:
        out_file.writelines("##gff-version 3\n")
        if gffs:
            for gff_hits in gffs:
                out_file.writelines(gff_hits + "\n")
        else:
            # raise Exception("No terms were found from query set")
            out_file.writelines("##No terms were found from query set\n")


if __name__ == "__main__":
    print(os.getcwd())
    parser = argparse.ArgumentParser(
        description="Uses a selection of terms to query an input file for matching cases"
    )
    parser.add_argument(
        "--dbaseTerms", nargs="*", help="dbase terms to search"
    )  # will be a select option, based on KEY within the JSON dbase
    parser.add_argument(
        "--custom_txt",
        nargs="*",
        help="custom user input terms; if using Galaxy, terms are __cn__ separated, otherwise space separated",
    )
    parser.add_argument(
        "--custom_file",
        type=argparse.FileType("r"),
        help="custom newline-separated search term file",
    )
    parser.add_argument(
        "--gff3_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="GFF3 file(s); if multiple files, use another flag",
    )
    parser.add_argument(
        "--gbk_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="GBK file(s); if multiple files, use another flag",
    )
    parser.add_argument(
        "--fa_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="FASTA file(s); if multiple files, use another flag",
    )
    parser.add_argument(
        "--blast_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="BLAST XML file(s); if multiple files, use another flag",
    )
    parser.add_argument(
        "--output", type=argparse.FileType("w+"), default="termHits.txt"
    )
    parser.add_argument(
        "--prox", action="store_true", help="Use when running the prox2lysis pipeline"
    )
    args = parser.parse_args()

    ############ STEP I
    ##### Determine user's terms to query
    dbase_terms = dbaseTerms(terms=args.dbaseTerms, galaxy=True)
    user_terms = userTerms(file=args.custom_file, text=args.custom_txt)
    glued_terms = glueTerms(dbase_terms=dbase_terms, user_terms=user_terms)

    ############ STEP II
    ##### Create list with matches
    files = glueFiles(
        gff=args.gff3_files,
        gbk=args.gbk_files,
        fa=args.fa_files,
        blast=args.blast_files,
    )
    gffs = readGFF3(files=files[0], search_list=glued_terms)
    gbks = readGBK(files=files[1], search_list=glued_terms)
    fas = readFASTA(files=files[2], search_list=glued_terms)
    blasts = readBLAST(files=files[3], search_list=glued_terms)

    ############ STEP III
    ##### Output results to a text file or gff3
    if args.prox:
        write_gff3(gffs, outName=args.output)
    else:
        writeResults(gffs, gbks, fas, blasts, outName=args.output)
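
For orientation, here is a minimal usage sketch, not part of the repository. It assumes searchFile.py and its dependencies (explodeJSON, gffutils, Biopython) are importable; the term strings and file names are illustrative only.

```python
# Minimal sketch: exercise the pure helpers directly. dbaseTerms() would normally pull
# terms from data/lysis-family-v1.0.3.json, but glueTerms() simply concatenates whatever
# lists it is given, so literal terms work here.
from searchFile import glueTerms, searchInput  # assumes searchFile.py is importable

search_list = glueTerms(dbase_terms=["holin"], user_terms=["endolysin", "spanin"])
hits = searchInput("putative endolysin, class II", search_list)
print(hits)  # ['putative endolysin, class II'] -- the string matched the term "endolysin"

# Equivalent command-line use against real inputs (file names are hypothetical):
#   python searchFile.py --custom_txt "holin__cn__endolysin" \
#       --gbk_files phage.gbk --output termHits.txt
```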
