Mercurial > repos > iuc > virannot_rps2tsv
comparison otu.py @ 4:998724a43694 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 7036ce0e06b6dc64332b1a5642fc58928523c5c6
| author | iuc |
|---|---|
| date | Tue, 13 May 2025 11:52:07 +0000 |
| parents | d1fd5579469d |
| children |
comparison
equal
deleted
inserted
replaced
| 3:d1fd5579469d | 4:998724a43694 |
|---|---|
| 2 | 2 |
| 3 | 3 |
| 4 # Name: virAnnot_otu | 4 # Name: virAnnot_otu |
| 5 # Author: Marie Lefebvre - INRAE | 5 # Author: Marie Lefebvre - INRAE |
| 6 # Requirements: Ete3 toolkit and external apps | 6 # Requirements: Ete3 toolkit and external apps |
| 7 # Aims: Create viral OTUs based on RPS and Blast annotations | 7 |
| 8 | 8 """Create viral OTUs based on RPS and Blast annotations""" |
| 9 | 9 |
| 10 import argparse | 10 import argparse |
| 11 import csv | 11 import csv |
| 12 import logging as log | 12 import logging as log |
| 13 import os | 13 import os |
| 63 startQ = int(row[5]) | 63 startQ = int(row[5]) |
| 64 endQ = int(row[6]) | 64 endQ = int(row[6]) |
| 65 frame = float(row[7]) | 65 frame = float(row[7]) |
| 66 description = row[8] | 66 description = row[8] |
| 67 superkingdom = row[9] | 67 superkingdom = row[9] |
| 68 try: | |
| 69 pident = row[10] | |
| 70 except IndexError: | |
| 71 log.info(rps_file[0]) | |
| 72 log.info(row) | |
| 68 match = re.search("Viruses", superkingdom) | 73 match = re.search("Viruses", superkingdom) |
| 69 # if contig is viral then retrieve sequence | 74 # if contig is viral then retrieve sequence |
| 70 if match: | 75 if match and float(pident) >= options.viral_portion: |
| 71 options.fasta.sort() | 76 options.fasta.sort() |
| 72 seq = _retrieve_fasta_seq(options.fasta[i][0], query_id) | 77 seq = _retrieve_fasta_seq(options.fasta[i][0], query_id) |
| 73 seq_length = len(seq) | 78 seq_length = len(seq) |
| 74 if endQ < seq_length: | 79 if endQ < seq_length: |
| 75 seq = seq[startQ - 1:endQ] | 80 seq = seq[startQ - 1:endQ] |
| 101 if "nb" not in collection[cdd_id][query_id]: | 106 if "nb" not in collection[cdd_id][query_id]: |
| 102 collection[cdd_id][query_id]["nb"] = 0 | 107 collection[cdd_id][query_id]["nb"] = 0 |
| 103 if "taxonomy" not in collection[cdd_id][query_id]: | 108 if "taxonomy" not in collection[cdd_id][query_id]: |
| 104 collection[cdd_id][query_id]["taxonomy"] = "Unknown" | 109 collection[cdd_id][query_id]["taxonomy"] = "Unknown" |
| 105 else: | 110 else: |
| 106 log.info("No blast file") | 111 log.debug("No blast file") |
| 107 collection[cdd_id][query_id]["taxonomy"] = "Unknown" | 112 collection[cdd_id][query_id]["taxonomy"] = "Unknown" |
| 108 collection[cdd_id][query_id]["nb"] = 0 | 113 collection[cdd_id][query_id]["nb"] = 0 |
| 109 | 114 # keep pfamXXX and RdRp 1 |
| 110 collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1] # keep pfamXXX and RdRp 1 | 115 collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1] |
| 111 collection[cdd_id]["full_description"] = description | 116 collection[cdd_id]["full_description"] = description |
| 112 i += 1 | 117 i += 1 |
| 118 if options.merge_rdrp == "yes": | |
| 119 rdrp_list = ["pfam00680", "pfam02123", "pfam00978", "pfam00998"] | |
| 120 collection["RdRp_merge"] = {} | |
| 121 for cdd_id in collection: | |
| 122 if cdd_id in rdrp_list and cdd_id != "RdRp_merge": | |
| 123 log.info("Add " + cdd_id + " in merge") | |
| 124 for query_id in collection[cdd_id]: | |
| 125 if query_id not in collection["RdRp_merge"]: | |
| 126 collection["RdRp_merge"][query_id] = {} | |
| 127 collection["RdRp_merge"][query_id] = collection[cdd_id][query_id] | |
| 113 return collection | 128 return collection |
| 114 | 129 |
| 115 | 130 |
| 116 def _retrieve_fasta_seq(fasta_file, query_id): | 131 def _retrieve_fasta_seq(fasta_file, query_id): |
| 117 """ | 132 """ |
| 179 log.info("Align sequences") | 194 log.info("Align sequences") |
| 180 if not os.path.exists(options.output): | 195 if not os.path.exists(options.output): |
| 181 os.mkdir(options.output) | 196 os.mkdir(options.output) |
| 182 color_by_sample = {} | 197 color_by_sample = {} |
| 183 for cdd_id in hits_collection: | 198 for cdd_id in hits_collection: |
| 184 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") | 199 log.info("align seq for " + cdd_id) |
| 200 if cdd_id == "RdRp_merge": | |
| 201 cdd_output = options.output + "/" + cdd_id | |
| 202 else: | |
| 203 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") | |
| 185 if not os.path.exists(cdd_output): | 204 if not os.path.exists(cdd_output): |
| 186 os.mkdir(cdd_output) | 205 os.mkdir(cdd_output) |
| 187 if os.path.exists(cdd_output + "/seq_to_align.fasta"): | 206 if os.path.exists(cdd_output + "/seq_to_align.fasta"): |
| 188 os.remove(cdd_output + "/seq_to_align.fasta") | 207 os.remove(cdd_output + "/seq_to_align.fasta") |
| 189 if os.path.exists(cdd_output + "/seq_nucc.fasta"): | 208 if os.path.exists(cdd_output + "/seq_nucc.fasta"): |
| 221 | 240 |
| 222 # create tree plot with colors | 241 # create tree plot with colors |
| 223 file_matrix = cdd_output + "/identity_matrix.csv" | 242 file_matrix = cdd_output + "/identity_matrix.csv" |
| 224 log.info("Create tree...") | 243 log.info("Create tree...") |
| 225 _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config) | 244 _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config) |
| 226 _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id) | 245 _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id) |
| 227 log.info("Retrieve OTUs...") | 246 log.info("Retrieve OTUs...") |
| 228 # if os.path.exists(file_cluster): | 247 # if os.path.exists(file_cluster): |
| 229 # os.remove(file_cluster) | 248 # os.remove(file_cluster) |
| 230 otu_cmd = os.path.join(options.tool_path, 'seek_otu.R') + ' ' + file_matrix + ' ' + file_cluster + ' ' + str(options.perc) | 249 otu_cmd = os.path.join(options.tool_path, 'seek_otu.R') + ' ' + file_matrix + ' ' + file_cluster + ' ' + str(options.perc) |
| 231 log.debug(otu_cmd) | 250 log.debug(otu_cmd) |
| 239 f = open(file_cluster, "w+") | 258 f = open(file_cluster, "w+") |
| 240 f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',') | 259 f.write('OTU_1,1,' + list(hits_collection[cdd_id].keys())[0] + ',') |
| 241 f.close() | 260 f.close() |
| 242 | 261 |
| 243 | 262 |
| 244 def _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id): | 263 def _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id): |
| 245 """ | 264 """ |
| 246 Calculate pairwise distance between aligned protein sequences | 265 Calculate pairwise distance between aligned protein sequences |
| 247 from a cdd_id | 266 from a cdd_id |
| 248 """ | 267 """ |
| 249 log.info("Compute pairwise distance of " + cdd_id) | 268 log.info("Compute pairwise distance of " + cdd_id) |
| 295 file_xlsx = options.output + '/otu_stats.xlsx' # Create a workbook | 314 file_xlsx = options.output + '/otu_stats.xlsx' # Create a workbook |
| 296 workbook = xlsxwriter.Workbook(file_xlsx) | 315 workbook = xlsxwriter.Workbook(file_xlsx) |
| 297 log.info("Writing stats to " + file_xlsx) | 316 log.info("Writing stats to " + file_xlsx) |
| 298 for cdd_id in hits_collection: | 317 for cdd_id in hits_collection: |
| 299 otu_collection = {} | 318 otu_collection = {} |
| 300 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") | 319 if cdd_id == "RdRp_merge": |
| 301 worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet | 320 cdd_output = options.output + "/" + cdd_id |
| 321 worksheet = workbook.add_worksheet(cdd_id) | |
| 322 else: | |
| 323 | |
| 324 cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") | |
| 325 worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet | |
| 302 file_cluster = cdd_output + '/otu_cluster.csv' | 326 file_cluster = cdd_output + '/otu_cluster.csv' |
| 303 file_fasta_nucc = cdd_output + '/representative_nucc.fasta' | 327 file_fasta_nucc = cdd_output + '/representative_nucc.fasta' |
| 304 with open(file_cluster, 'r') as clust: | 328 with open(file_cluster, 'r') as clust: |
| 305 otu_reader = csv.reader(clust, delimiter=',') | 329 otu_reader = csv.reader(clust, delimiter=',') |
| 306 samples_list = [] | 330 samples_list = [] |
| 313 samples_list.append(sample) if sample not in samples_list else samples_list | 337 samples_list.append(sample) if sample not in samples_list else samples_list |
| 314 if sample not in otu_collection[row[0]]: | 338 if sample not in otu_collection[row[0]]: |
| 315 otu_collection[row[0]][sample] = {} | 339 otu_collection[row[0]][sample] = {} |
| 316 otu_collection[row[0]][sample][contig] = {} | 340 otu_collection[row[0]][sample][contig] = {} |
| 317 # add read number of the contig and annotation | 341 # add read number of the contig and annotation |
| 318 if 'nb' in hits_collection[cdd_id][contig]: | 342 if contig in hits_collection[cdd_id]: |
| 319 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] | 343 if 'nb' in hits_collection[cdd_id][contig]: |
| 344 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] | |
| 345 else: | |
| 346 otu_collection[row[0]][sample][contig]['nb'] = 0 | |
| 347 if 'taxonomy' in hits_collection[cdd_id][contig]: | |
| 348 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | |
| 349 else: | |
| 350 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' | |
| 320 else: | 351 else: |
| 321 otu_collection[row[0]][sample][contig]['nb'] = 0 | 352 otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'} |
| 322 if 'taxonomy' in hits_collection[cdd_id][contig]: | |
| 323 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | |
| 324 else: | |
| 325 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' | |
| 326 else: | 353 else: |
| 327 otu_collection[row[0]][sample][contig] = {} | 354 otu_collection[row[0]][sample][contig] = {} |
| 328 # add read number of the contig and annotation | 355 # add read number of the contig and annotation |
| 329 if 'nb' in hits_collection[cdd_id][contig]: | 356 if contig in hits_collection[cdd_id]: |
| 330 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] | 357 if 'nb' in hits_collection[cdd_id][contig]: |
| 358 otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] | |
| 359 else: | |
| 360 otu_collection[row[0]][sample][contig]['nb'] = 0 | |
| 361 if 'taxonomy' in hits_collection[cdd_id][contig]: | |
| 362 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | |
| 363 else: | |
| 364 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' | |
| 331 else: | 365 else: |
| 332 otu_collection[row[0]][sample][contig]['nb'] = 0 | 366 otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'} |
| 333 if 'taxonomy' in hits_collection[cdd_id][contig]: | |
| 334 otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | |
| 335 else: | |
| 336 otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' | |
| 337 if 'taxonomy' in hits_collection[cdd_id][contig]: | 367 if 'taxonomy' in hits_collection[cdd_id][contig]: |
| 338 otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] | 368 otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] |
| 339 else: | 369 else: |
| 340 otu_collection[row[0]]['global_taxonomy'] = 'unknown' | 370 otu_collection[row[0]]['global_taxonomy'] = 'unknown' |
| 341 | 371 |
| 360 worksheet.write(row, column + 2, 'contigs_list') | 390 worksheet.write(row, column + 2, 'contigs_list') |
| 361 row = 1 | 391 row = 1 |
| 362 # column = 0 | 392 # column = 0 |
| 363 with open(file_fasta_nucc, "w+") as f_nucc: | 393 with open(file_fasta_nucc, "w+") as f_nucc: |
| 364 for otu in otu_collection: | 394 for otu in otu_collection: |
| 365 log.info(otu) | |
| 366 if isinstance(otu_collection[otu], dict): | 395 if isinstance(otu_collection[otu], dict): |
| 367 column = 0 | 396 column = 0 |
| 368 worksheet.write(row, column, otu) | 397 worksheet.write(row, column, otu) |
| 369 # prepare table with 0 in each cells | 398 # prepare table with 0 in each cells |
| 370 for sample in otu_collection[otu]: | 399 for sample in otu_collection[otu]: |
| 403 | 432 |
| 404 with open(map_file_path, "w+") as map_file: | 433 with open(map_file_path, "w+") as map_file: |
| 405 headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n'] | 434 headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n'] |
| 406 map_file.write("\t".join(headers)) | 435 map_file.write("\t".join(headers)) |
| 407 for cdd_id in hits_collection: | 436 for cdd_id in hits_collection: |
| 408 cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_") | 437 if cdd_id == "RdRp_merge": |
| 438 cdd_output = "RdRp_merge" | |
| 439 else: | |
| 440 cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_") | |
| 409 short_description = cdd_output | 441 short_description = cdd_output |
| 410 file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa' | 442 file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa' |
| 411 tree_file = cdd_output + '/tree.dnd.png' | 443 tree_file = cdd_output + '/tree.dnd.png' |
| 412 file_cluster = cdd_output + '/otu_cluster.csv' | 444 file_cluster = cdd_output + '/otu_cluster.csv' |
| 413 file_matrix = cdd_output + "/identity_matrix.csv" | 445 file_matrix = cdd_output + "/identity_matrix.csv" |
| 420 log.debug(html_cmd) | 452 log.debug(html_cmd) |
| 421 os.system(html_cmd) | 453 os.system(html_cmd) |
| 422 | 454 |
| 423 | 455 |
| 424 def _set_options(): | 456 def _set_options(): |
| 457 """ | |
| 458 Set parameters | |
| 459 """ | |
| 425 parser = argparse.ArgumentParser() | 460 parser = argparse.ArgumentParser() |
| 426 parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+') | 461 parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+') |
| 427 parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+') | 462 parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+') |
| 428 parser.add_argument('-f', '--fasta', help='FASTA file with contigs', action='append', required=True, dest='fasta', nargs='+') | 463 parser.add_argument('-f', '--fasta', help='FASTA file with contigs', action='append', required=True, dest='fasta', nargs='+') |
| 429 parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc') | 464 parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc') |
| 430 parser.add_argument('-vp', '--viral_portion', help='Minimum portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion') | 465 parser.add_argument('-vp', '--viral_portion', help='Minimum portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion') |
| 431 parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length') | 466 parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length') |
| 467 parser.add_argument('-m', '--merge_rdrp', help='Merge RdRp1, 2, 3 and 4 to create otu on it.', action='store', type=str, default="no", dest='merge_rdrp') | |
| 432 parser.add_argument('-tp', '--tool_path', help='Path to otu_seek.R', action='store', type=str, default='./', dest='tool_path') | 468 parser.add_argument('-tp', '--tool_path', help='Path to otu_seek.R', action='store', type=str, default='./', dest='tool_path') |
| 433 parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output') | 469 parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output') |
| 434 parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb') | 470 parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb') |
| 435 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) | 471 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) |
| 436 args = parser.parse_args() | 472 args = parser.parse_args() |
| 437 return args | 473 return args |
| 438 | 474 |
| 439 | 475 |
| 440 def _set_log_level(verbosity): | 476 def _set_log_level(verbosity): |
| 477 """ | |
| 478 Debug | |
| 479 """ | |
| 441 if verbosity == 1: | 480 if verbosity == 1: |
| 442 log_format = '%(asctime)s %(levelname)-8s %(message)s' | 481 log_format = '%(asctime)s %(levelname)-8s %(message)s' |
| 443 log.basicConfig(level=log.INFO, format=log_format) | 482 log.basicConfig(level=log.INFO, format=log_format) |
| 444 elif verbosity == 3: | 483 elif verbosity == 3: |
| 445 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' | 484 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' |
