erga_ear: make_EAR.py comparison

comparison make_EAR.py @ 2:a34826ae0a73 draft

planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit e293d14e82a903a4cab64dd72dfa3f3798466176

author	bgruening
date	Fri, 30 Aug 2024 09:27:31 +0000
parents	8c99976de71e
children	3dd6be0cd8dd

comparison

equal deleted inserted replaced

-:b61022e1b807
+:a34826ae0a73
 import argparse
-import glob
 import logging
 import math
 import os
 import re
 import sys
 # make_EAR_glxy.py
 # CAUTION: This is for the Galaxy version!
 # by Diego De Panis
 # ERGA Sequencing and Assembly Committee
-EAR_version = "v24.05.20_glxy_beta"
+EAR_version = "v24.08.26"
 def make_report(yaml_file):
 logging.basicConfig(filename='EAR.log', level=logging.INFO)
 # Read the content from EAR.yaml file
 if len(lines) > order:
 target_line = lines[order]
 fifth_column_value = target_line.split('\t')[4].strip()
 return fifth_column_value
 except Exception as e:
-logging.warning(f"Error reading {file_path}: {str(e)}")
+logging.error(f"Error reading {file_path} for tool {tool} and haplotype {haplotype}: {str(e)}")
 return ''
-# Getting kmer plots for curated asm
-def get_png_files(dir_path):
-png_files = glob.glob(f"{dir_path}/*.ln.png")
-if len(png_files) < 4:
-logging.warning(f"Warning: Less than 4 png files found in {dir_path}. If this is diploid, some images may be missing.")
-# fill missing with None
-while len(png_files) < 4:
-png_files.append(None)
-return png_files[:4]
 # get unique part in file names
 def find_unique_parts(file1, file2):
 # Split filenames into parts
 parts1 = file1.split('.')
 parts2 = file2.split('.')
 # Find unique parts
 unique_parts1 = [part for part in parts1 if part not in parts2]
 unique_parts2 = [part for part in parts2 if part not in parts1]
 return ' '.join(unique_parts1), ' '.join(unique_parts2)
 # extract BUSCO values
 def extract_busco_values(file_path):
 try:
 return warnings
 # Parse pipeline and generate "tree"
 def generate_pipeline_tree(pipeline_data):
 tree_lines = []
-indent = "&nbsp;" * 2  # Adjust indent spacing as needed
+indent = "&nbsp;" * 2  # Adjust indent spacing
-for tool_version_param in pipeline_data:
+if isinstance(pipeline_data, dict):
-parts = tool_version_param.split('|')
+for tool, version_param in pipeline_data.items():
-tool_version = parts[0]
+# Tool line
-tool, version = tool_version.split('_v') if '_v' in tool_version else (tool_version, "NA")
+tool_line = f"- <b>{tool}</b>"
+tree_lines.append(tool_line)
-# Handle parameters: join all but the first (which is tool_version) with ', '
-param_text = ', '.join(parts[1:]) if len(parts) > 1 else "NA"
+# Convert version_param to string and split
+version_param_str = str(version_param)
-# Tool line
+parts = version_param_str.split('/')
-tool_line = f"- <b>{tool}</b>"
+version = parts[0]
-tree_lines.append(tool_line)
+params = [p for p in parts[1:] if p]  # This will remove empty strings
 # Version line
-version_line = f"{indent*2}|_ <i>ver:</i> {version}"
+version_line = f"{indent * 2}|_ <i>ver:</i> {version}"
 tree_lines.append(version_line)
 # Param line(s)
-if param_text != "NA":
+if params:
-for param in param_text.split(','):
+for param in params:
-param = param.strip()
+param_line = f"{indent * 2}|_ <i>key param:</i> {param}"
-param_line = f"{indent*2}|_ <i>key param:</i> {param if param else 'NA'}"
+tree_lines.append(param_line)
+else:
+param_line = f"{indent * 2}|_ <i>key param:</i> NA"
 tree_lines.append(param_line)
 else:
-param_line = f"{indent*2}|_ <i>key param:</i> NA"
+tree_lines.append("Invalid pipeline data format")
-tree_lines.append(param_line)
 # Join lines with HTML break for paragraph
 tree_diagram = "<br/>".join(tree_lines)
 return tree_diagram
 submitter = yaml_data["Submitter"]
 affiliation = yaml_data["Affiliation"]
 tags = yaml_data["Tags"]
 # Check if tag is valid
-valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Satellite"]
+valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Community", "ERGA-testing"]
 if tags not in valid_tags:
 tags += "[INVALID TAG]"
-logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Satellite")
+logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Community.")
 # Get data from GoaT based on species name
 # urllib.parse.quote to handle special characters and spaces in the species name
 species_name = requests.utils.quote(species)
 data_values.append('NA' if not coverage else coverage)
 # Create a list of lists for the table
 table_data = [headers, data_values]
-# Extract pipeline data from 'Pre-curation' category
+# Extract pipeline data
-asm_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Pre-curation', {}).get('pipeline', [])
+asm_pipeline_data = yaml_data.get('PIPELINES', {}).get('Assembly', {})
+curation_pipeline_data = yaml_data.get('PIPELINES', {}).get('Curation', {})
+# Extract pipeline data from 'Curated' category
 asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data)
-# Extract pipeline data from 'Curated' category
-curation_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}).get('pipeline', [])
 curation_pipeline_tree = generate_pipeline_tree(curation_pipeline_data)
 # Reading GENOME PROFILING DATA section from yaml #############################################
 profiling_data = yaml_data.get('PROFILING')
 # Check if profiling_data is available
 if not profiling_data:
 logging.error('Error: No profiling data found in the YAML file.')
 sys.exit(1)
-# Handle GenomeScope specific processing
+# Check for GenomeScope data (mandatory)
 genomescope_data = profiling_data.get('GenomeScope')
-if genomescope_data:
+if not genomescope_data:
-summary_file = genomescope_data.get('genomescope_summary_txt')
+logging.error("Error: GenomeScope data is missing in the YAML file. This is mandatory.")
-if summary_file and os.path.exists(summary_file):
-with open(summary_file, "r") as f:
-summary_txt = f.read()
-genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
-proposed_ploidy_match = re.search(r"p = (\d+)", summary_txt)
-proposed_ploidy = proposed_ploidy_match.group(1) if proposed_ploidy_match else 'NA'
-else:
-logging.error(f"File {summary_file} not found for GenomeScope.")
-sys.exit(1)
-else:
-logging.error("GenomeScope data is missing in the PROFILING section.")
 sys.exit(1)
-# Handle Smudgeplot specific processing
+genomescope_summary = genomescope_data.get('genomescope_summary_txt')
+if not genomescope_summary:
+logging.error("Error: GenomeScope summary file path is missing in the YAML file.")
+sys.exit(1)
+# Read the content of the GenomeScope summary file
+try:
+with open(genomescope_summary, "r") as f:
+summary_txt = f.read()
+# Extract values from summary.txt
+genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
+proposed_ploidy = re.search(r"p = (\d+)", summary_txt).group(1)
+except Exception as e:
+logging.error(f"Error reading GenomeScope summary file: {str(e)}")
+sys.exit(1)
+# Check for Smudgeplot data (optional)
 smudgeplot_data = profiling_data.get('Smudgeplot')
 if smudgeplot_data:
-verbose_summary_file = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
+smudgeplot_summary = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
-if verbose_summary_file and os.path.exists(verbose_summary_file):
+if smudgeplot_summary:
-with open(verbose_summary_file, "r") as f:
+try:
-smud_summary_txt = f.readlines()
+with open(smudgeplot_summary, "r") as f:
-for line in smud_summary_txt:
+smud_summary_txt = f.readlines()
-if line.startswith("* Proposed ploidy"):
+for line in smud_summary_txt:
-proposed_ploidy = line.split(":")[1].strip()
+if line.startswith("* Proposed ploidy"):
-break
+proposed_ploidy = line.split(":")[1].strip()
+break
+except Exception as e:
+logging.warning(f"Error reading Smudgeplot summary file: {str(e)}. Using GenomeScope ploidy.")
 else:
-logging.warning(f"Verbose summary file {verbose_summary_file} not found for Smudgeplot; skipping detailed Smudgeplot analysis.")
+logging.warning("Smudgeplot summary file path is missing. Using GenomeScope ploidy.")
 else:
-logging.warning("Smudgeplot data is missing in the PROFILING section; skipping Smudgeplot analysis.")
+logging.info("Smudgeplot data not provided. Using GenomeScope ploidy.")
 # Reading ASSEMBLY DATA section from yaml #####################################################
 asm_data = yaml_data.get('ASSEMBLIES', {})
 # make a list from the assemblies available in asm_data
 asm_stages = []
 for asm_stage, stage_properties in asm_data.items():
 for haplotypes in stage_properties.keys():
-if haplotypes != 'pipeline' and haplotypes not in asm_stages:
+if haplotypes not in asm_stages:
 asm_stages.append(haplotypes)
 # get gfastats-based data
 gfastats_data = {}
 for asm_stage, stage_properties in asm_data.items():
 gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2)
 gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp
 except (ValueError, ZeroDivisionError):
 gaps_per_gbp_data[(asm_stage, haplotypes)] = ''
-# Define the contigging table (column names) DON'T MOVE THIS AGAIN!!!!!!!
+# Define the contigging table (column names)
 asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]]
 # Fill the table with the gfastats data
 for i in range(len(display_names)):
 metric = display_names[i]
 if metric not in exclusion_list:
 asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 # Add the gaps/gbp in between
-gc_index = display_names.index("GC %")
-gc_index
 asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 # get QV, Kmer completeness and BUSCO data
 qv_data = {}
 completeness_data = {}
 busco_data = {metric: {} for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']}
 for asm_stage, stage_properties in asm_data.items():
-asm_stage_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+asm_stage_elements = list(stage_properties.keys())
 for i, haplotypes in enumerate(asm_stage_elements):
 haplotype_properties = stage_properties[haplotypes]
 if isinstance(haplotype_properties, dict):
 if 'merqury_qv' in haplotype_properties:
 qv_data[(asm_stage, haplotypes)] = get_qv_value(haplotype_properties['merqury_qv'], i, asm_stage, haplotypes)
 styles = getSampleStyleSheet()
 styles.add(ParagraphStyle(name='TitleStyle', fontName='Courier', fontSize=20))
 styles.add(ParagraphStyle(name='subTitleStyle', fontName='Courier', fontSize=16))
 styles.add(ParagraphStyle(name='normalStyle', fontName='Courier', fontSize=12))
 styles.add(ParagraphStyle(name='midiStyle', fontName='Courier', fontSize=10))
-styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
+# styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
 styles.add(ParagraphStyle(name='treeStyle', fontName='Courier', fontSize=10, leftIndent=12))
 styles.add(ParagraphStyle(name='miniStyle', fontName='Courier', fontSize=8))
 styles.add(ParagraphStyle(name='FileNameStyle', fontName='Courier', fontSize=6))
 # PDF SECTION 1 -------------------------------------------------------------------------------
 # Spacer
 elements.append(Spacer(1, 24))
 # Iterate over haplotypes in the Curated category to get data for EBP metrics
 curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
-haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']
+haplotype_names = list(curated_assemblies.keys())
 for haplotype in haplotype_names:
 properties = curated_assemblies[haplotype]
 if 'gfastats--nstar-report_txt' in properties and 'merqury_qv' in properties:
 gfastats_path = properties['gfastats--nstar-report_txt']
 elements.append(Spacer(1, 5))
 # Store BUSCO version and lineage information from each file in list
 busco_info_list = []
 for asm_stages, stage_properties in asm_data.items():
-for haplotype_keys, haplotype_properties in stage_properties.items():
+for i, haplotype_properties in stage_properties.items():
 if isinstance(haplotype_properties, dict):
 if 'busco_short_summary_txt' in haplotype_properties:
 busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
 if busco_version and lineage_info:
 busco_info_list.append((busco_version, lineage_info))
 # Initialize counter
 tool_count = 0
 # Add title and images for each step
-for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
+for asm_stages, stage_properties in asm_data.items():
 if asm_stages == 'Curated':
-tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+tool_elements = list(stage_properties.keys())
 images_with_names = []
 for haplotype in tool_elements:
 haplotype_properties = stage_properties[haplotype]
 if len(tool_elements) > 1 and tool_elements.index(haplotype) < len(tool_elements) - 1:
 images_with_names.append([Spacer(1, 12)])
 # Add images and names to the elements in pairs
 for i in range(0, len(images_with_names), 4):  # Process two images (and their names) at a time
-elements_to_add = images_with_names[i:i + 4]
+elements_to_add = images_with_names[i: i + 4]
 # Create table for the images and names
 table = Table(elements_to_add)
 table.hAlign = 'CENTER'
 elements.append(table)
 # Initialize counter
 counter = 0
 # Iterate over haplotypes in the Curated category to get K-mer spectra images
 curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
-haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']
 # Get paths for spectra files
 spectra_files = {
 'hap1': {
 'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_hap_spectra_cn_png', None),
 # Initialize counter
 tool_count = 0
 # Add title and images for each step
-for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
+for asm_stages, stage_properties in asm_data.items():
 if asm_stages == 'Curated':  # Check if the current stage is 'Curated'
-tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+tool_elements = list(stage_properties.keys())
 for haplotype in tool_elements:
 haplotype_properties = stage_properties[haplotype]
 if isinstance(haplotype_properties, dict) and 'blobplot_cont_png' in haplotype_properties:
 # Get image path

Mercurial > repos > bgruening > erga_ear

comparison make_EAR.py @ 2:a34826ae0a73 draft