diff make_EAR.py @ 3:3dd6be0cd8dd draft default tip

planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit 720787c4fb8885f5127ab6ada2813f8dd580921c
author bgruening
date Tue, 15 Oct 2024 12:52:59 +0000
parents a34826ae0a73
children
line wrap: on
line diff
--- a/make_EAR.py	Fri Aug 30 09:27:31 2024 +0000
+++ b/make_EAR.py	Tue Oct 15 12:52:59 2024 +0000
@@ -21,7 +21,7 @@
 # CAUTION: This is for the Galaxy version!
 # by Diego De Panis
 # ERGA Sequencing and Assembly Committee
-EAR_version = "v24.08.26"
+EAR_version = "v24.10.15"
 
 
 def make_report(yaml_file):
@@ -36,18 +36,29 @@
         try:
             value_float = float(value)
             if value_float.is_integer():
-                # format as an integer if no decimal part
+                # format as integer if no decimal
                 return f'{int(value_float):,}'
             else:
                 # format as a float
                 return f'{value_float:,}'
         except ValueError:
-            # return the original value if it can't be converted to a float
+            # return original value if can't be converted to float
             return value
 
     # extract gfastats values
     def extract_gfastats_values(content, keys):
-        return [re.findall(f"{key}: (.+)", content)[0] for key in keys]
+        values = []
+        for key in keys:
+            # Try the default colon-separated format first
+            match = re.search(rf"{re.escape(key)}:\s*(.+)", content)
+            if not match:
+                # If not found, try Galaxy's tab-separated format
+                match = re.search(rf"{re.escape(key)}\t(.+)", content)
+            if match:
+                values.append(match.group(1).strip())
+            else:
+                values.append("N/A")
+        return values
 
     keys = [
         "Total scaffold length",
@@ -79,9 +90,17 @@
     def extract_total_bp_from_gfastats(gfastats_path):
         with open(gfastats_path, "r") as f:
             content = f.read()
-        total_bp = re.search(r"Total scaffold length: (.+)", content).group(1)
-        total_bp = int(total_bp.replace(',', ''))
-        return "{:,}".format(total_bp)
+        # Try colon-separated format first
+        match = re.search(r"Total scaffold length:\s*(.+)", content)
+        if not match:
+            # If not found, try tab-separated format
+            match = re.search(r"Total scaffold length\t(.+)", content)
+        if match:
+            total_bp = match.group(1).replace(',', '')
+            return "{:,}".format(int(total_bp))
+        else:
+            logging.error(f"Could not find Total scaffold length in {gfastats_path}")
+            return "N/A"
 
     # compute EBP quality metric
     def compute_ebp_metric(haplotype, gfastats_path, qv_value):
@@ -93,7 +112,6 @@
         values = extract_gfastats_values(content, keys_needed)
         contig_n50_log = math.floor(math.log10(int(values[0].replace(',', ''))))
         scaffold_n50_log = math.floor(math.log10(int(values[1].replace(',', ''))))
-
         return f"Obtained EBP quality metric for {haplotype}: {contig_n50_log}.{scaffold_n50_log}.Q{math.floor(float(qv_value))}"
 
     # extract qv values
@@ -151,6 +169,8 @@
     def extract_busco_info(file_path):
         busco_version = None
         lineage_info = None
+        busco_mode = None
+        busco_pred = None
 
         try:
             with open(file_path, 'r') as file:
@@ -158,18 +178,20 @@
                 version_match = re.search(r"# BUSCO version is: ([\d.]+)", content)
                 if version_match:
                     busco_version = version_match.group(1)
-                lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of genomes: (\d+), number of BUSCOs: (\d+)\)", content)
+                lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of (genomes|species): (\d+), number of BUSCOs: (\d+)\)", content)
                 if lineage_match:
-                    lineage_info = lineage_match.groups()
-                if not lineage_info:
-                    lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of species: (\d+), number of BUSCOs: (\d+)\)", content)
-                    if lineage_match:
-                        lineage_info = lineage_match.groups()
+                    lineage_info = (lineage_match.group(1), lineage_match.group(3), lineage_match.group(4))
+                mode_match = re.search(r"# BUSCO was run in mode: (\w+)", content)
+                if mode_match:
+                    busco_mode = mode_match.group(1)
+                pred_match = re.search(r"# Gene predictor used: (\w+)", content)
+                if pred_match:
+                    busco_pred = pred_match.group(1)
 
         except Exception as e:
             logging.warning(f"Error reading {file_path}: {str(e)}")
 
-        return busco_version, lineage_info
+        return busco_version, lineage_info, busco_mode, busco_pred
 
     # Function to check and generate warning messages
     def generate_warning_paragraphs(expected, observed, trait):
@@ -195,6 +217,7 @@
                     paragraphs.append(Paragraph(message, styles["midiStyle"]))
         except Exception as e:
             logging.warning(f"Error in generating warning for {trait}: {str(e)}")
+
         return paragraphs
 
     # Generate warnings for curated haplotypes (qv, kcomp, busco)
@@ -465,19 +488,29 @@
             if isinstance(haplotype_properties, dict):
                 if 'gfastats--nstar-report_txt' in haplotype_properties:
                     file_path = haplotype_properties['gfastats--nstar-report_txt']
-                    with open(file_path, 'r') as file:
-                        content = file.read()
-                    gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys)
+                    try:
+                        with open(file_path, 'r') as file:
+                            content = file.read()
+                        gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys)
+                    except FileNotFoundError:
+                        logging.error(f"Gfastats file not found: {file_path}")
+                    except Exception as e:
+                        logging.error(f"Error processing gfastats file {file_path}: {str(e)}")
 
     gaps_per_gbp_data = {}
     for (asm_stage, haplotypes), values in gfastats_data.items():
         try:
-            gaps = float(values[gaps_index])
-            total_length = float(values[total_length_index])
-            gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2)
-            gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp
-        except (ValueError, ZeroDivisionError):
-            gaps_per_gbp_data[(asm_stage, haplotypes)] = ''
+            gaps = float(values[gaps_index].replace(',', ''))
+            total_length = float(values[total_length_index].replace(',', ''))
+            if total_length > 0:
+                gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2)
+                gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp
+            else:
+                logging.warning(f"Total length is zero for {asm_stage} {haplotypes}")
+                gaps_per_gbp_data[(asm_stage, haplotypes)] = 'N/A'
+        except (ValueError, IndexError):
+            logging.warning(f"Could not calculate gaps per Gbp for {asm_stage} {haplotypes}")
+            gaps_per_gbp_data[(asm_stage, haplotypes)] = 'N/A'
 
     # Define the contigging table (column names)
     asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]]
@@ -486,11 +519,10 @@
     for i in range(len(display_names)):
         metric = display_names[i]
         if metric not in exclusion_list:
-            asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
+            asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), ['N/A'] * len(keys))[i]) if (asm_stage, haplotypes) in gfastats_data else 'N/A' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 
     # Add the gaps/gbp in between
-    asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
-
+    asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), 'N/A')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
     # get QV, Kmer completeness and BUSCO data
     qv_data = {}
     completeness_data = {}
@@ -747,23 +779,31 @@
     # Spacer
     elements.append(Spacer(1, 5))
 
-    # Store BUSCO version and lineage information from each file in list
+    # Store BUSCO information from each file in a list
     busco_info_list = []
     for asm_stages, stage_properties in asm_data.items():
         for i, haplotype_properties in stage_properties.items():
             if isinstance(haplotype_properties, dict):
                 if 'busco_short_summary_txt' in haplotype_properties:
-                    busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
-                    if busco_version and lineage_info:
-                        busco_info_list.append((busco_version, lineage_info))
+                    busco_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
+                    if all(busco_info):
+                        busco_info_list.append(busco_info)
+
+    # Function to format BUSCO information
+    def format_busco_info(info):
+        version, (lineage, genomes, buscos), mode, predictor = info
+        return f"BUSCO: {version} ({mode}, {predictor}) / Lineage: {lineage} (genomes:{genomes}, BUSCOs:{buscos})"
 
     # Checking if all elements in the list are identical
-    if all(info == busco_info_list[0] for info in busco_info_list):
-        busco_version, (lineage_name, num_genomes, num_buscos) = busco_info_list[0]
-        elements.append(Paragraph(f"BUSCO {busco_version} Lineage: {lineage_name} (genomes:{num_genomes}, BUSCOs:{num_buscos})", styles['miniStyle']))
+    if busco_info_list and all(info == busco_info_list[0] for info in busco_info_list):
+        busco_text = format_busco_info(busco_info_list[0])
+        elements.append(Paragraph(busco_text, styles['miniStyle']))
     else:
-        elements.append(Paragraph("Warning: BUSCO versions or lineage datasets are not the same across results", styles['miniStyle']))
-        logging.warning("WARNING!!! BUSCO versions or lineage datasets are not the same across results")
+        elements.append(Paragraph("Warning! BUSCO versions or lineage datasets are not the same across results:", styles['miniStyle']))
+        logging.warning("WARNING: BUSCO versions or lineage datasets are not the same across results")
+        for info in busco_info_list:
+            busco_text = format_busco_info(info)
+            elements.append(Paragraph(busco_text, styles['miniStyle']))
 
     # Page break
     elements.append(PageBreak())
@@ -806,7 +846,7 @@
 
                 # Add paragraph for the link
                 if link:
-                    link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[LINK]</link>'
+                    link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[non-permanent LINK]</link>'
                 else:
                     link_html = f'<b>{haplotype}</b> File link is missing!'
 
@@ -852,93 +892,75 @@
     curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
 
     # Get paths for spectra files
-    spectra_files = {
-        'hap1': {
-            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_hap_spectra_cn_png', None),
-        },
-        'hap2': {
-            'spectra_cn_png': curated_assemblies.get('hap2', {}).get('merqury_hap_spectra_cn_png', None),
-        },
-        'common': {
-            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_cn_png', None),
-            'spectra_asm_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_asm_png', None),
-        }
-    }
-
-    # Filter out None values and empty strings
-    spectra_files = {k: {sk: v for sk, v in sv.items() if v} for k, sv in spectra_files.items()}
+    spectra_files = {'common': {}}
+    for assembly_type, assembly_data in curated_assemblies.items():
+        if 'merqury_hap_spectra_cn_png' in assembly_data:
+            spectra_files[assembly_type] = {'spectra_cn_png': assembly_data['merqury_hap_spectra_cn_png']}
+        if 'merqury_spectra_cn_png' in assembly_data:
+            spectra_files['common']['spectra_cn_png'] = assembly_data['merqury_spectra_cn_png']
+        if 'merqury_spectra_asm_png' in assembly_data:
+            spectra_files['common']['spectra_asm_png'] = assembly_data['merqury_spectra_asm_png']
 
     # Determine the number of spectra-cn files and assign unique names if needed
     spectra_cn_files = [
-        spectra_files['common'].get('spectra_cn_png', None),
-        spectra_files['hap1'].get('spectra_cn_png', None),
-        spectra_files['hap2'].get('spectra_cn_png', None)
+        file_dict.get('spectra_cn_png', None)
+        for file_dict in spectra_files.values()
+        if file_dict.get('spectra_cn_png')
     ]
-    spectra_cn_files = [f for f in spectra_cn_files if f]  # Filter out None values
+    spectra_cn_files = list(set(spectra_cn_files))  # Remove duplicates
 
     if len(spectra_cn_files) == 3:
-        # For 3 spectra-cn files
         shortest_spectra_cn_file = min(spectra_cn_files, key=lambda f: len(os.path.basename(f)), default=None)
         similar_files = [f for f in spectra_cn_files if f != shortest_spectra_cn_file]
         if similar_files:
             unique_name1, unique_name2 = find_unique_parts(os.path.basename(similar_files[0]), os.path.basename(similar_files[1]))
     else:
         shortest_spectra_cn_file = spectra_cn_files[0] if spectra_cn_files else None
-        unique_name1 = unique_name2 = None
+        # unique_name1 = unique_name2 = None
 
     # Create image objects and add filename below each image
     images = []
-
     for label, file_dict in spectra_files.items():
         for key, png_file in file_dict.items():
-            if png_file:
-                image = Image(png_file, width=8.4 * cm, height=7 * cm)
-                filename = os.path.basename(png_file)
+            if png_file and os.path.exists(png_file):
+                try:
+                    image = Image(png_file, width=8.4 * cm, height=7 * cm)
+                    filename = os.path.basename(png_file)
 
-                if filename.endswith("spectra-asm.ln.png"):
-                    text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
-                elif filename.endswith("spectra-cn.ln.png"):
-                    if len(spectra_cn_files) == 3:
-                        # For 3 spectra-cn files use particular text
-                        if png_file == shortest_spectra_cn_file:
-                            text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)"
+                    if filename.endswith("spectra-asm.ln.png"):
+                        text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
+                    elif filename.endswith("spectra-cn.ln.png"):
+                        if len(spectra_cn_files) == 3:
+                            if png_file == shortest_spectra_cn_file:
+                                text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)"
+                            else:
+                                text = f"Distribution of k-mer counts per copy numbers found in {label} (hapl.)"
                         else:
-                            if png_file == spectra_files['hap1'].get('spectra_cn_png', None):
-                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name1}</b> (hapl.)"
-                            elif png_file == spectra_files['hap2'].get('spectra_cn_png', None):
-                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name2}</b> (hapl.)"
-                            else:
-                                text = "Distribution of k-mer counts per copy numbers found in asm"
+                            text = "Distribution of k-mer counts per copy numbers found in asm"
                     else:
-                        # For 2 spectra-cn files use same text
-                        text = "Distribution of k-mer counts per copy numbers found in asm"
-                else:
-                    text = filename
-
-                images.append([image, Paragraph(text, styles["midiStyle"])])
+                        text = filename
 
-    # Filter None values
-    images = [img for img in images if img[0] is not None]
-
-    # Get number of rows and columns for the table
-    num_rows = (len(images) + 1) // 2  # +1 to handle odd numbers of images
-    num_columns = 2
+                    images.append([image, Paragraph(text, styles["midiStyle"])])
+                except Exception as e:
+                    logging.error(f"Error processing image {png_file}: {str(e)}")
 
     # Create the table with dynamic size
-    image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
-    image_table = Table(image_table_data)
+    if images:
+        num_rows = (len(images) + 1) // 2
+        num_columns = 2
+        image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
+        image_table = Table(image_table_data)
 
-    # Style the "table"
-    table_style = TableStyle([
-        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
-        ('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
-    ])
+        # Style the table
+        table_style = TableStyle([
+            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
+            ('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
+        ])
 
-    # Set the style
-    image_table.setStyle(table_style)
-
-    # Add image table to elements
-    elements.append(image_table)
+        image_table.setStyle(table_style)
+        elements.append(image_table)
+    else:
+        elements.append(Paragraph("No K-mer spectra images available.", styles["midiStyle"]))
 
     # Increase counter by the number of PNGs added
     counter += len(images)