Mercurial > repos > bgruening > erga_ear
diff make_EAR.py @ 3:3dd6be0cd8dd draft default tip
planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit 720787c4fb8885f5127ab6ada2813f8dd580921c
author | bgruening |
---|---|
date | Tue, 15 Oct 2024 12:52:59 +0000 |
parents | a34826ae0a73 |
children |
line wrap: on
line diff
--- a/make_EAR.py Fri Aug 30 09:27:31 2024 +0000 +++ b/make_EAR.py Tue Oct 15 12:52:59 2024 +0000 @@ -21,7 +21,7 @@ # CAUTION: This is for the Galaxy version! # by Diego De Panis # ERGA Sequencing and Assembly Committee -EAR_version = "v24.08.26" +EAR_version = "v24.10.15" def make_report(yaml_file): @@ -36,18 +36,29 @@ try: value_float = float(value) if value_float.is_integer(): - # format as an integer if no decimal part + # format as integer if no decimal return f'{int(value_float):,}' else: # format as a float return f'{value_float:,}' except ValueError: - # return the original value if it can't be converted to a float + # return original value if can't be converted to float return value # extract gfastats values def extract_gfastats_values(content, keys): - return [re.findall(f"{key}: (.+)", content)[0] for key in keys] + values = [] + for key in keys: + # colon-separated as default format first + match = re.search(rf"{re.escape(key)}:\s*(.+)", content) + if not match: + # If not try galaxy's tab-separated + match = re.search(rf"{re.escape(key)}\t(.+)", content) + if match: + values.append(match.group(1).strip()) + else: + values.append("N/A") + return values keys = [ "Total scaffold length", @@ -79,9 +90,17 @@ def extract_total_bp_from_gfastats(gfastats_path): with open(gfastats_path, "r") as f: content = f.read() - total_bp = re.search(r"Total scaffold length: (.+)", content).group(1) - total_bp = int(total_bp.replace(',', '')) - return "{:,}".format(total_bp) + # Try colon-separated format first + match = re.search(r"Total scaffold length:\s*(.+)", content) + if not match: + # If not found, try tab-separated format + match = re.search(r"Total scaffold length\t(.+)", content) + if match: + total_bp = match.group(1).replace(',', '') + return "{:,}".format(int(total_bp)) + else: + logging.error(f"Could not find Total scaffold length in {gfastats_path}") + return "N/A" # compute EBP quality metric def compute_ebp_metric(haplotype, gfastats_path, qv_value): @@ -93,7 +112,6 @@ values = extract_gfastats_values(content, keys_needed) contig_n50_log = math.floor(math.log10(int(values[0].replace(',', '')))) scaffold_n50_log = math.floor(math.log10(int(values[1].replace(',', '')))) - return f"Obtained EBP quality metric for {haplotype}: {contig_n50_log}.{scaffold_n50_log}.Q{math.floor(float(qv_value))}" # extract qv values @@ -151,6 +169,8 @@ def extract_busco_info(file_path): busco_version = None lineage_info = None + busco_mode = None + busco_pred = None try: with open(file_path, 'r') as file: @@ -158,18 +178,20 @@ version_match = re.search(r"# BUSCO version is: ([\d.]+)", content) if version_match: busco_version = version_match.group(1) - lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of genomes: (\d+), number of BUSCOs: (\d+)\)", content) + lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of (genomes|species): (\d+), number of BUSCOs: (\d+)\)", content) if lineage_match: - lineage_info = lineage_match.groups() - if not lineage_info: - lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of species: (\d+), number of BUSCOs: (\d+)\)", content) - if lineage_match: - lineage_info = lineage_match.groups() + lineage_info = (lineage_match.group(1), lineage_match.group(3), lineage_match.group(4)) + mode_match = re.search(r"# BUSCO was run in mode: (\w+)", content) + if mode_match: + busco_mode = mode_match.group(1) + pred_match = re.search(r"# Gene predictor used: (\w+)", content) + if pred_match: + busco_pred = pred_match.group(1) except Exception as e: logging.warning(f"Error reading {file_path}: {str(e)}") - return busco_version, lineage_info + return busco_version, lineage_info, busco_mode, busco_pred # Function to check and generate warning messages def generate_warning_paragraphs(expected, observed, trait): @@ -195,6 +217,7 @@ paragraphs.append(Paragraph(message, styles["midiStyle"])) except Exception as e: logging.warning(f"Error in generating warning for {trait}: {str(e)}") + return paragraphs # Generate warnings for curated haplotypes (qv, kcomp, busco) @@ -465,19 +488,29 @@ if isinstance(haplotype_properties, dict): if 'gfastats--nstar-report_txt' in haplotype_properties: file_path = haplotype_properties['gfastats--nstar-report_txt'] - with open(file_path, 'r') as file: - content = file.read() - gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys) + try: + with open(file_path, 'r') as file: + content = file.read() + gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys) + except FileNotFoundError: + logging.error(f"Gfastats file not found: {file_path}") + except Exception as e: + logging.error(f"Error processing gfastats file {file_path}: {str(e)}") gaps_per_gbp_data = {} for (asm_stage, haplotypes), values in gfastats_data.items(): try: - gaps = float(values[gaps_index]) - total_length = float(values[total_length_index]) - gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2) - gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp - except (ValueError, ZeroDivisionError): - gaps_per_gbp_data[(asm_stage, haplotypes)] = '' + gaps = float(values[gaps_index].replace(',', '')) + total_length = float(values[total_length_index].replace(',', '')) + if total_length > 0: + gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2) + gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp + else: + logging.warning(f"Total length is zero for {asm_stage} {haplotypes}") + gaps_per_gbp_data[(asm_stage, haplotypes)] = 'N/A' + except (ValueError, IndexError): + logging.warning(f"Could not calculate gaps per Gbp for {asm_stage} {haplotypes}") + gaps_per_gbp_data[(asm_stage, haplotypes)] = 'N/A' # Define the contigging table (column names) asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]] @@ -486,11 +519,10 @@ for i in range(len(display_names)): metric = display_names[i] if metric not in exclusion_list: - asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) + asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), ['N/A'] * len(keys))[i]) if (asm_stage, haplotypes) in gfastats_data else 'N/A' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) # Add the gaps/gbp in between - asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) - + asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), 'N/A')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) # get QV, Kmer completeness and BUSCO data qv_data = {} completeness_data = {} @@ -747,23 +779,31 @@ # Spacer elements.append(Spacer(1, 5)) - # Store BUSCO version and lineage information from each file in list + # Store BUSCO information from each file in a list busco_info_list = [] for asm_stages, stage_properties in asm_data.items(): for i, haplotype_properties in stage_properties.items(): if isinstance(haplotype_properties, dict): if 'busco_short_summary_txt' in haplotype_properties: - busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt']) - if busco_version and lineage_info: - busco_info_list.append((busco_version, lineage_info)) + busco_info = extract_busco_info(haplotype_properties['busco_short_summary_txt']) + if all(busco_info): + busco_info_list.append(busco_info) + + # Function to format BUSCO information + def format_busco_info(info): + version, (lineage, genomes, buscos), mode, predictor = info + return f"BUSCO: {version} ({mode}, {predictor}) / Lineage: {lineage} (genomes:{genomes}, BUSCOs:{buscos})" # Checking if all elements in the list are identical - if all(info == busco_info_list[0] for info in busco_info_list): - busco_version, (lineage_name, num_genomes, num_buscos) = busco_info_list[0] - elements.append(Paragraph(f"BUSCO {busco_version} Lineage: {lineage_name} (genomes:{num_genomes}, BUSCOs:{num_buscos})", styles['miniStyle'])) + if busco_info_list and all(info == busco_info_list[0] for info in busco_info_list): + busco_text = format_busco_info(busco_info_list[0]) + elements.append(Paragraph(busco_text, styles['miniStyle'])) else: - elements.append(Paragraph("Warning: BUSCO versions or lineage datasets are not the same across results", styles['miniStyle'])) - logging.warning("WARNING!!! BUSCO versions or lineage datasets are not the same across results") + elements.append(Paragraph("Warning! BUSCO versions or lineage datasets are not the same across results:", styles['miniStyle'])) + logging.warning("WARNING: BUSCO versions or lineage datasets are not the same across results") + for info in busco_info_list: + busco_text = format_busco_info(info) + elements.append(Paragraph(busco_text, styles['miniStyle'])) # Page break elements.append(PageBreak()) @@ -806,7 +846,7 @@ # Add paragraph for the link if link: - link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[LINK]</link>' + link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[non-permanent LINK]</link>' else: link_html = f'<b>{haplotype}</b> File link is missing!' @@ -852,93 +892,75 @@ curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}) # Get paths for spectra files - spectra_files = { - 'hap1': { - 'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_hap_spectra_cn_png', None), - }, - 'hap2': { - 'spectra_cn_png': curated_assemblies.get('hap2', {}).get('merqury_hap_spectra_cn_png', None), - }, - 'common': { - 'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_cn_png', None), - 'spectra_asm_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_asm_png', None), - } - } - - # Filter out None values and empty strings - spectra_files = {k: {sk: v for sk, v in sv.items() if v} for k, sv in spectra_files.items()} + spectra_files = {'common': {}} + for assembly_type, assembly_data in curated_assemblies.items(): + if 'merqury_hap_spectra_cn_png' in assembly_data: + spectra_files[assembly_type] = {'spectra_cn_png': assembly_data['merqury_hap_spectra_cn_png']} + if 'merqury_spectra_cn_png' in assembly_data: + spectra_files['common']['spectra_cn_png'] = assembly_data['merqury_spectra_cn_png'] + if 'merqury_spectra_asm_png' in assembly_data: + spectra_files['common']['spectra_asm_png'] = assembly_data['merqury_spectra_asm_png'] # Determine the number of spectra-cn files and assign unique names if needed spectra_cn_files = [ - spectra_files['common'].get('spectra_cn_png', None), - spectra_files['hap1'].get('spectra_cn_png', None), - spectra_files['hap2'].get('spectra_cn_png', None) + file_dict.get('spectra_cn_png', None) + for file_dict in spectra_files.values() + if file_dict.get('spectra_cn_png') ] - spectra_cn_files = [f for f in spectra_cn_files if f] # Filter out None values + spectra_cn_files = list(set(spectra_cn_files)) # Remove duplicates if len(spectra_cn_files) == 3: - # For 3 spectra-cn files shortest_spectra_cn_file = min(spectra_cn_files, key=lambda f: len(os.path.basename(f)), default=None) similar_files = [f for f in spectra_cn_files if f != shortest_spectra_cn_file] if similar_files: unique_name1, unique_name2 = find_unique_parts(os.path.basename(similar_files[0]), os.path.basename(similar_files[1])) else: shortest_spectra_cn_file = spectra_cn_files[0] if spectra_cn_files else None - unique_name1 = unique_name2 = None + # unique_name1 = unique_name2 = None # Create image objects and add filename below each image images = [] - for label, file_dict in spectra_files.items(): for key, png_file in file_dict.items(): - if png_file: - image = Image(png_file, width=8.4 * cm, height=7 * cm) - filename = os.path.basename(png_file) + if png_file and os.path.exists(png_file): + try: + image = Image(png_file, width=8.4 * cm, height=7 * cm) + filename = os.path.basename(png_file) - if filename.endswith("spectra-asm.ln.png"): - text = "Distribution of k-mer counts coloured by their presence in reads/assemblies" - elif filename.endswith("spectra-cn.ln.png"): - if len(spectra_cn_files) == 3: - # For 3 spectra-cn files use particular text - if png_file == shortest_spectra_cn_file: - text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)" + if filename.endswith("spectra-asm.ln.png"): + text = "Distribution of k-mer counts coloured by their presence in reads/assemblies" + elif filename.endswith("spectra-cn.ln.png"): + if len(spectra_cn_files) == 3: + if png_file == shortest_spectra_cn_file: + text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)" + else: + text = f"Distribution of k-mer counts per copy numbers found in {label} (hapl.)" else: - if png_file == spectra_files['hap1'].get('spectra_cn_png', None): - text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name1}</b> (hapl.)" - elif png_file == spectra_files['hap2'].get('spectra_cn_png', None): - text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name2}</b> (hapl.)" - else: - text = "Distribution of k-mer counts per copy numbers found in asm" + text = "Distribution of k-mer counts per copy numbers found in asm" else: - # For 2 spectra-cn files use same text - text = "Distribution of k-mer counts per copy numbers found in asm" - else: - text = filename - - images.append([image, Paragraph(text, styles["midiStyle"])]) + text = filename - # Filter None values - images = [img for img in images if img[0] is not None] - - # Get number of rows and columns for the table - num_rows = (len(images) + 1) // 2 # +1 to handle odd numbers of images - num_columns = 2 + images.append([image, Paragraph(text, styles["midiStyle"])]) + except Exception as e: + logging.error(f"Error processing image {png_file}: {str(e)}") # Create the table with dynamic size - image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)] - image_table = Table(image_table_data) + if images: + num_rows = (len(images) + 1) // 2 + num_columns = 2 + image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)] + image_table = Table(image_table_data) - # Style the "table" - table_style = TableStyle([ - ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), - ('BOTTOMPADDING', (0, 0), (-1, -1), 20), # 20 here is a spacer between rows - ]) + # Style the table + table_style = TableStyle([ + ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), + ('BOTTOMPADDING', (0, 0), (-1, -1), 20), # 20 here is a spacer between rows + ]) - # Set the style - image_table.setStyle(table_style) - - # Add image table to elements - elements.append(image_table) + image_table.setStyle(table_style) + elements.append(image_table) + else: + elements.append(Paragraph("No K-mer spectra images available.", styles["midiStyle"])) # Increase counter by the number of PNGs added counter += len(images)