changeset 6:61a5641ab89b draft

Uploaded
author maciek
date Mon, 14 Jul 2025 06:51:12 +0000
parents d26f92631414
children 1eac4316c242
files fbqs_integrator/fbqs-integrator.py fbqs_integrator/fbqs_integrator.xml fbqs_integrator/fbqs_integrator/fbqs_integrator/fbqs-integrator.py fbqs_integrator/fbqs_integrator/fbqs_integrator/fbqs_integrator.xml
diffstat 4 files changed, 528 insertions(+), 528 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fbqs_integrator/fbqs-integrator.py	Mon Jul 14 06:51:12 2025 +0000
@@ -0,0 +1,447 @@
+###################################################
+#
+# Integrates Fastp, Bracken, and QUAST results into the STARAMR spreadsheet output.
+# Input files are tooldistillator outputs from the Abromics workflow, in JSON or Excel format.
+# Performs quality checks at the same time.
+#
+# Created: 2025-01-29
+# Last Edited: 2025-05-14
+#
+# Contributors:
+#
+# Elena Lavinia Diaconu (elena.diaconu@izslt.it)
+# Maciej Kochanowski (maciej.kochanowski@piwet.pulawy.pl)
+# Sebastien Leclercq (sebastien.leclercq@inrae.fr)
+#
+# developed as part of the SPAMR-VET Consortium (EUPAHW)
+###################################################
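+
+# A typical invocation (the file names below are illustrative only):
+#
+#   python fbqs-integrator.py \
+#       --staramr staramr_report.xlsx \
+#       --fastp-jsons sampleA_FastpKraken.json sampleB_FastpKraken.json \
+#       --quast-files sampleA_quast.json sampleB_quast.json \
+#       --output enhanced_report.xlsx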
+
+import os
+import re
+import json
+import argparse
+import numpy as np
+import pandas as pd
+
+def extract_sample_id(file_path):
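+    """Derive a sample ID from a tooldistillator JSON report, falling back to the file name."""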
+    try:
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        
+        if isinstance(data, list):
+            fastp_section = next((i for i in data if i.get('analysis_software_name') == 'fastp'), None)
+            if fastp_section:
+                fastp_report = next((r for r in fastp_section.get('results', []) if r.get('name') == 'fastp_report'), None)
+                if fastp_report and fastp_report.get('content'):
+                    command_str = fastp_report['content'][0].get('command', '')
+                    match = re.search(r'fastp report for ([^\s]+)\.fastqsanger\.gz', command_str)
+                    if match:
+                        return match.group(1)
+        
+        if isinstance(data, dict) and data.get('analysis_software_name') == 'quast':
+            for r in data.get('results', []):
+                if r.get('name') == 'quast_report' and isinstance(r.get('content'), list) and r['content']:
+                    assembly = r['content'][0].get('Assembly', None)
+                    if assembly:
+                        return assembly
+        
+        if isinstance(data, list):
+            quast_section = next((i for i in data if i.get('analysis_software_name') == 'quast'), None)
+            if quast_section and 'results' in quast_section:
+                for r in quast_section['results']:
+                    if r.get('name') == 'quast_report' and isinstance(r.get('content'), list) and r['content']:
+                        assembly = r['content'][0].get('Assembly', None)
+                        if assembly:
+                            return assembly
+    
+    except Exception:
+        pass  # unreadable or unexpected JSON; fall back to the file name below
+    
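+    # Derive the ID from the file name, e.g. 'sampleA_FastpKraken.json' -> 'sampleA'.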
+    base = os.path.basename(file_path)
+    name = re.sub(r'\.(json|csv|tsv|fasta|fa|xlsx|txt|dat)$', '', base, flags=re.IGNORECASE)
+    for suffix in ['_FastpKraken', '_quast', '_fastp', '_kraken', '_bracken', '_recentrifuge', '_report', '-1']:
+        if name.endswith(suffix):
+            name = name[:-len(suffix)]
+            break
+    return name.strip('_- ')
+
+def safe_float_convert(v, default=np.nan):
+    try:
+        if isinstance(v, str):
+            v = v.replace(',', '')
+        return float(v)
+    except (TypeError, ValueError):
+        return default
+
+def safe_int_convert(v, default=np.nan):
+    try:
+        if isinstance(v, str):
+            v = v.replace(',', '')
+        return int(float(v))
+    except (TypeError, ValueError):
+        return default
+
+def parse_fastp_json(data):
+    sec = next((i for i in data if i.get('analysis_software_name') == 'fastp'), None)
+    if not sec:
+        raise ValueError("fastp section missing")
+    rep = next((r for r in sec.get('results', []) if r.get('name') == 'fastp_report'), None)
+    if not rep or not rep.get('content'):
+        raise ValueError("fastp_report missing")
+    c = rep['content'][0]
+    pre = c.get('summary', {}).get('before_filtering', {})
+    post = c.get('summary', {}).get('after_filtering', {})
+    filt = c.get('filtering_result', {})
+    dup = c.get('duplication', {})
+    return {
+        'total_reads': pre.get('total_reads'),
+        'total_bases': pre.get('total_bases'),
+        'q20_rate': pre.get('q20_rate'),
+        'q30_rate': pre.get('q30_rate'),
+        'gc_content': pre.get('gc_content'),
+        'duplication_rate': dup.get('rate'),
+        'passed_filter_reads': filt.get('passed_filter_reads'),
+        'low_quality_reads': filt.get('low_quality_reads'),
+        'total_reads_after': post.get('total_reads'),
+        'ratio_reads_after': post.get('total_reads') / pre.get('total_reads') if pre.get('total_reads') and post.get('total_reads') is not None else np.nan,
+        'total_bases_after': post.get('total_bases'),
+        'q30_rate_after': post.get('q30_rate'),
+        'gc_content_after': post.get('gc_content')
+    }
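+
+# Sketch of the tooldistillator JSON layout the parsers above and below assume
+# (field values are illustrative):
+#
+#   [{"analysis_software_name": "fastp",
+#     "results": [{"name": "fastp_report",
+#                  "content": [{"summary": {"before_filtering": {...},
+#                                           "after_filtering": {...}},
+#                               "filtering_result": {...},
+#                               "duplication": {"rate": 0.02}}]}]},
+#    {"analysis_software_name": "bracken",
+#     "results": [{"name": "bracken_report",
+#                  "content": [{"name": "Escherichia coli",
+#                               "new_est_reads": 900000,
+#                               "fraction_total_reads": 0.95}]}]}]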
+
+def extract_top_taxa_from_bracken(data):
+    sec = next((i for i in data if i.get('analysis_software_name') == 'bracken'), None)
+    result = {f'{p}_{i}': np.nan for i in range(1,6) for p in ['taxon','count','proportion']}
+    if not sec:
+        return result
+    rep = next((r for r in sec.get('results', []) if r.get('name') == 'bracken_report'), None)
+    content = rep.get('content', []) if rep else []
+    taxa = []
+    for item in content:
+        taxa.append({
+            'name': item.get('name'),
+            'count': item.get('new_est_reads'),
+            'score': item.get('fraction_total_reads')
+        })
+    taxa = sorted(taxa, key=lambda x: x['count'] or 0, reverse=True)[:5]
+    for idx, t in enumerate(taxa, 1):
+        result[f'taxon_{idx}'] = t['name']
+        result[f'count_{idx}'] = safe_int_convert(t['count'])
+        result[f'proportion_{idx}'] = safe_float_convert(t['score'])*100
+    return result
+
+def parse_quast_json(path):
+    try:
+        with open(path, 'r') as f:
+            data = json.load(f)
+        sec = None
+        if isinstance(data, list):
+            sec = next((i for i in data if i.get('analysis_software_name') == 'quast'), None)
+        elif isinstance(data, dict) and data.get('analysis_software_name') == 'quast':
+            sec = data
+        if sec and 'results' in sec:
+            for r in sec['results']:
+                if r.get('name') == 'quast_report' and isinstance(r.get('content'), list) and r['content']:
+                    raw = r['content'][0]
+                    break
+            else:
+                raise ValueError("quast_report content missing")
+        else:
+            raise ValueError("quast section missing")
+        cleaned = {}
+        for k, v in raw.items():
+            key = str(k).strip()
+            key = key.replace(' ', '_').replace('#', 'num').replace("(%", 'percent').replace(")", '').replace("(", '')
+            # tolerate a leading 'num_' (from '#') so '# contigs (>= 0 bp)'
+            # normalizes to 'num_contigs_0bp' rather than 'num_num_contigs_0bp'
+            key = re.sub(r'(?:num_)?contigs_>=_?(\d+)_?bp', r'num_contigs_\1bp', key)
+            if key in ('contigs', 'num_contigs'):
+                key = 'num_contigs_500bp'
+            if key == 'GC':
+                key = 'GC_percent'
+            cleaned[key] = v
+        for e in ['N50','Total_length','Largest_contig','num_contigs_500bp','GC_percent']:
+            if e not in cleaned:
+                cleaned[e] = np.nan
+        return cleaned
+    except Exception:
+        try:
+            df = pd.read_csv(path, sep='\t', engine='python')
+        except Exception:
+            df = pd.read_csv(path, sep=',', engine='python')
+        if df.shape[1] == 2 and df.shape[0] > 1:
+            d = pd.Series(df.iloc[:,1].values, index=df.iloc[:,0]).to_dict()
+        else:
+            d = df.iloc[0].to_dict()
+        cleaned = {}
+        for k,v in d.items():
+            key = str(k).strip().replace(' ', '_').replace('#', 'num').replace('(%)', 'percent').replace('(', '').replace(')', '')
+            key = re.sub(r'(?:num_)?contigs_>=_?(\d+)_?bp', r'num_contigs_\1bp', key)
+            if key == 'GC':
+                key = 'GC_percent'
+            cleaned[key] = v
+        for e in ['N50','Total_length','Largest_contig','num_contigs_500bp','GC_percent']:
+            if e not in cleaned:
+                cleaned[e] = np.nan
+        return cleaned
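+
+# Key normalization examples (labels assumed from QUAST's usual report output):
+#   '# contigs (>= 0 bp)' -> 'num_contigs_0bp'
+#   'Largest contig'      -> 'Largest_contig'
+#   'GC (%)'              -> 'GC_percent'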
+
+def calculate_quality_metrics(fastp, bracken, quast):
+    thresholds = {
+        'total reads after filtering': {'warning': 500000, 'fail': 400000},
+        'ratio of filtered reads': {'warning': 0.8, 'fail': 0.6},
+        'Bracken main taxon proportion': {'warning': 90, 'fail': 80},
+        'reads gc content': {'warning': (0.3,0.7), 'fail': (0.25,0.75)},
+        'N50': {'warning': 35000, 'fail': 30000},
+        'largest contig': {'warning': 100000, 'fail': 50000},
+        'number of contigs >500 bp': {'warning': 400, 'fail': 500},
+        'number of contigs >0 bp': {'warning': 500, 'fail': 700},
+    }
+    fb = []
+    result = 'Passed'
+    def chk(val, name, mode='low'):
+        nonlocal result
+        if pd.isna(val):
+            fb.append(f"Missing {name}")
+            return
+        w = thresholds[name]['warning']
+        f = thresholds[name]['fail']
+        if mode == 'low':
+            if val < f:
+                result, fb_msg = 'Fail', f"Low {name} (<{f})"
+            elif val < w:
+                if result != 'Fail':
+                    result = 'Warning'
+                fb_msg = f"Low {name} (<{w})"
+            else:
+                return
+        elif mode == 'high':
+            if val > f:
+                result, fb_msg = 'Fail', f"High {name} (>{f})"
+            elif val > w:
+                if result != 'Fail':
+                    result = 'Warning'
+                fb_msg = f"High {name} (>{w})"
+            else:
+                return
+        elif mode == 'range':
+            if not (f[0] <= val <= f[1]):
+                result, fb_msg = 'Fail', f"Abnormal {name}"
+            elif not (w[0] <= val <= w[1]):
+                if result != 'Fail':
+                    result = 'Warning'
+                fb_msg = f"Borderline {name}"
+            else:
+                return
+        else:
+            return  # unknown mode; nothing to record
+        fb.append(fb_msg)
+    
+    chk(safe_int_convert(fastp.get('total_reads_after')), 'total reads after filtering', 'low')
+    chk(safe_float_convert(fastp.get('ratio_reads_after')), 'ratio of filtered reads', 'low')
+    chk(safe_float_convert(bracken.get('proportion_1')), 'Bracken main taxon proportion', 'low')
+    chk(safe_float_convert(fastp.get('gc_content')), 'reads gc content', 'range')
+    chk(safe_int_convert(quast.get('N50')), 'N50', 'low')
+    chk(safe_int_convert(quast.get('Largest_contig')), 'largest contig','low')
+    chk(safe_int_convert(quast.get('num_contigs_500bp')), 'number of contigs >500 bp', 'high')
+    chk(safe_int_convert(quast.get('num_contigs_0bp')), 'number of contigs >0 bp', 'high')
+    
+    if not fb:
+        fb = ['All tests passed']
+    return {'Quality_Module': result, 'Quality_Feedback': '; '.join(fb)}
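+
+# Example return value for a sample that trips one warning (illustrative):
+#   {'Quality_Module': 'Warning', 'Quality_Feedback': 'Low N50 (<35000)'}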
+
+def process_samples(fastp_files, quast_files, bracken_files=None):
+    fastp_map = {extract_sample_id(p): p for p in fastp_files}
+    quast_map = {extract_sample_id(p): p for p in quast_files}
+    bracken_map = {extract_sample_id(p): p for p in bracken_files} if bracken_files else {}
+    all_samples = set(fastp_map.keys()) | set(quast_map.keys()) | set(bracken_map.keys())
+    all_data = []
+    
+    # all_samples is a set, so each ID occurs once; sort for deterministic output order
+    for sample_id in sorted(all_samples):
+        
+        entry = {'sample_id': sample_id}
+        
+        if sample_id in fastp_map:
+            try:
+                with open(fastp_map[sample_id]) as fh:
+                    jd = json.load(fh)
+                entry['fastp'] = parse_fastp_json(jd)
+                if not bracken_files:
+                    entry['bracken'] = extract_top_taxa_from_bracken(jd)
+            except Exception as e:
+                print(f"Error parsing FastP for {sample_id}: {e}")
+                entry['fastp'] = {}
+                if not bracken_files:
+                    entry['bracken'] = {}
+        else:
+            entry['fastp'] = {}
+            if not bracken_files:
+                entry['bracken'] = {}
+        
+        if bracken_files:
+            if sample_id in bracken_map:
+                try:
+                    with open(bracken_map[sample_id]) as fh:
+                        jd = json.load(fh)
+                    entry['bracken'] = extract_top_taxa_from_bracken(jd)
+                except Exception as e:
+                    print(f"Error parsing Bracken for {sample_id}: {e}")
+                    entry['bracken'] = {}
+            else:
+                entry['bracken'] = {}
+        
+        if sample_id in quast_map:
+            try:
+                entry['quast'] = parse_quast_json(quast_map[sample_id])
+            except Exception as e:
+                print(f"Error parsing QUAST for {sample_id}: {e}")
+                entry['quast'] = {}
+        else:
+            print(f"Warning: no QUAST file for {sample_id}")
+            entry['quast'] = {}
+        
+        entry['qc'] = calculate_quality_metrics(entry['fastp'], entry['bracken'], entry['quast'])
+        all_data.append(entry)
+    
+    return all_data
+
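+# Each entry produced by process_samples has this shape (values illustrative):
+#   {'sample_id': 'sampleA',
+#    'fastp': {...}, 'bracken': {...}, 'quast': {...},
+#    'qc': {'Quality_Module': 'Passed', 'Quality_Feedback': 'All tests passed'}}
+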
+def create_aggregated_dataframes(all_data):
+    fp_cols = ['sample_id','total_reads','total_bases','q20_rate','q30_rate','gc_content',
+               'duplication_rate','passed_filter_reads','low_quality_reads',
+               'total_reads_after','total_bases_after','q30_rate_after','gc_content_after','ratio_reads_after']
+    br_cols = ['sample_id'] + [f'{p}_{i}' for i in range(1,6) for p in ['taxon','count','proportion']]
+    q_keys = set(['sample_id'])
+    for e in all_data:
+        q_keys.update(e['quast'].keys())
+    std = ['N50','Total_length','Largest_contig','num_contigs_500bp','GC_percent']
+    qu_cols = ['sample_id'] + [k for k in std if k in q_keys] + sorted([k for k in q_keys if k not in std and k!='sample_id'])
+    qc_cols = ['sample_id','Quality_Module','Quality_Feedback',
+               'total_reads_after','ratio_reads_after','bracken_main_taxon','proportion_of_main_taxon','reads_gc_content','N50','Total_length','Largest_contig','num_contigs_500bp','num_contigs_0bp']
+    
+    fp_list, br_list, qu_list, qc_list = [],[],[],[]
+    for e in all_data:
+        sid = e['sample_id']
+        fp = e['fastp']
+        br = e['bracken']
+        qu = e['quast']
+        qc = e['qc']
+        
+        fp_list.append({c: fp.get(c,np.nan) if c!='sample_id' else sid for c in fp_cols})
+        br_list.append({c: br.get(c,np.nan) if c!='sample_id' else sid for c in br_cols})
+        qu_list.append({c: qu.get(c,np.nan) if c!='sample_id' else sid for c in qu_cols})
+        qc_list.append({
+            **{
+                'sample_id': sid,
+                'Quality_Module': qc['Quality_Module'],
+                'Quality_Feedback': qc['Quality_Feedback']
+            },
+            **{
+                'total_reads_after': safe_int_convert(fp.get('total_reads_after')),
+                'ratio_reads_after': safe_float_convert(fp.get('ratio_reads_after')),
+                'bracken_main_taxon': '' if pd.isna(br.get('taxon_1')) else str(br.get('taxon_1')),
+                'proportion_of_main_taxon': safe_float_convert(br.get('proportion_1')),
+                'reads_gc_content': safe_float_convert(fp.get('gc_content')),
+                'N50': safe_int_convert(qu.get('N50')),
+                'Total_length': safe_int_convert(qu.get('Total_length')),
+                'Largest_contig': safe_int_convert(qu.get('Largest_contig')),
+                'num_contigs_500bp': safe_int_convert(qu.get('num_contigs_500bp')),
+                'num_contigs_0bp': safe_int_convert(qu.get('num_contigs_0bp'))
+            }
+        })
+    
+    return (
+        pd.DataFrame(fp_list, columns=fp_cols),
+        pd.DataFrame(qu_list, columns=qu_cols),
+        pd.DataFrame(br_list, columns=br_cols),
+        pd.DataFrame(qc_list, columns=qc_cols)
+    )
+
+def write_enhanced_excel(staramr_xlsx, output_xlsx, fastp_df, quast_df, bracken_df, quality_df):
+    try:
+        orig = pd.read_excel(staramr_xlsx, sheet_name=None, engine='openpyxl')
+    except Exception as e:
+        print(f"Error reading {staramr_xlsx}: {e}")
+        return
+    
+    if 'Summary' in orig:
+        sdf = orig['Summary']
+        samp = next((c for c in sdf.columns if 'id' in c.lower()), sdf.columns[0])
+        
+        # (Summary column, quality_df field, insert offset from the sample column);
+        # the offsets reproduce the final column order: sample ID, Quality Module,
+        # Detected main taxon, Quality Module Feedback
+        qc_columns = [
+            ('Quality Module', 'Quality_Module', 1),
+            ('Quality Module Feedback', 'Quality_Feedback', 2),
+            ('Detected main taxon', 'bracken_main_taxon', 2),
+        ]
+        for fbcol, field, offset in qc_columns:
+            if fbcol not in sdf.columns:
+                sdf.insert(sdf.columns.get_loc(samp) + offset, fbcol, '')
+            fb_map = {str(r['sample_id']): r[field] for _, r in quality_df.iterrows()}
+            # match rows by sample ID; DataFrame.update would align on the
+            # positional index and could pair values with the wrong samples
+            sdf[fbcol] = sdf[samp].astype(str).map(fb_map).fillna(sdf[fbcol])
+        
+        orig['Summary'] = sdf
+        orig['Summary'].drop(columns=['Scheme'], inplace=True, errors='ignore')
+    else:
+        print("Warning: no Summary sheet to update")
+    
+    for name, df in orig.items():
+        if isinstance(df, pd.DataFrame):
+            if 'Isolate ID' in df.columns:
+                df.sort_values(by='Isolate ID', inplace=True)
+                orig[name] = df
+    
+    if not fastp_df.empty:
+        fastp_df.sort_values(by='sample_id', inplace=True)
+    if not quast_df.empty:
+        quast_df.sort_values(by='sample_id', inplace=True)
+    if not bracken_df.empty:
+        bracken_df.sort_values(by='sample_id', inplace=True)
+    if not quality_df.empty:
+        quality_df.sort_values(by='sample_id', inplace=True)
+    
+    with pd.ExcelWriter(output_xlsx, engine='openpyxl') as w:
+        for name, df in orig.items():
+            df.to_excel(w, sheet_name=name, index=False)
+        if not fastp_df.empty:
+            fastp_df.fillna('').to_excel(w, sheet_name='FastP', index=False)
+        if not quast_df.empty:
+            quast_df.fillna('').to_excel(w, sheet_name='Quast', index=False)
+        if not bracken_df.empty:
+            bracken_df.fillna('').to_excel(w, sheet_name='Bracken', index=False)
+        if not quality_df.empty:
+            quality_df.fillna('').to_excel(w, sheet_name='Quality_metrics', index=False)
+    
+    print(f"Enhanced report saved to {output_xlsx}")
+
+def main():
+    p = argparse.ArgumentParser(description="Integrate FastP, Bracken, QUAST into STARAMR Excel report")
+    p.add_argument('--staramr', required=True, help="input STARAMR .xlsx")
+    p.add_argument('--fastp-jsons', nargs='+', required=True, help="FastP JSON files")
+    p.add_argument('--quast-files', nargs='+', required=True, help="QUAST JSON/TSV/CSV files")
+    p.add_argument('--bracken-jsons', nargs='+', required=False, help="Bracken JSON files (optional)")
+    p.add_argument('--output', required=True, help="output enhanced .xlsx")
+    args = p.parse_args()
+    
+    all_data = process_samples(args.fastp_jsons, args.quast_files, args.bracken_jsons)
+    fp_df, qu_df, br_df, qc_df = create_aggregated_dataframes(all_data)
+    write_enhanced_excel(args.staramr, args.output, fp_df, qu_df, br_df, qc_df)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fbqs_integrator/fbqs_integrator.xml	Mon Jul 14 06:51:12 2025 +0000
@@ -0,0 +1,81 @@
+<tool id="fbqs_integrator" name="FBQS Integrator" version="1.0.0" profile="21.05" license="GPL-3.0-or-later">
+    <description>Merge quality-checked outputs from Fastp, Bracken, and QUAST into a consolidated STARAMR Excel report</description>
+    
+    <requirements>
+        <requirement type="package" version="3.9">python</requirement>
+        <requirement type="package" version="1.5.3">pandas</requirement>
+        <requirement type="package" version="3.0.9">openpyxl</requirement>
+        <requirement type="package" version="1.21.0">numpy</requirement>
+    </requirements>
+
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/fbqs-integrator.py'
+        --staramr '$staramr'
+        --fastp-jsons
+        #for $file in $fastp_jsons
+            '$file'
+        #end for
+        --quast-files
+        #for $file in $quast_files
+            '$file'
+        #end for
+        #if $bracken_jsons
+        --bracken-jsons
+        #for $file in $bracken_jsons
+            '$file'
+        #end for
+        #end if
+        --output '$output'
+    ]]></command>
+
+    <inputs>
+        <param name="staramr" type="data" format="xlsx" label="Input STARAMR report (.xlsx)" 
+               help="STARAMR Excel report from upstream workflow"/>
+        <param name="fastp_jsons" type="data" format="json" multiple="true" 
+               label="Fastp JSON outputs" 
+               help="JSON files from tooldistillator containing Fastp results"/>
+        <param name="quast_files" type="data" format="json,tabular,csv" multiple="true" 
+               label="QUAST outputs" 
+               help="QUAST result files in JSON, TSV, or CSV format"/>
+        <param name="bracken_jsons" type="data" format="json" multiple="true" optional="true"
+               label="Bracken JSON outputs (optional)" 
+               help="Optional separate Bracken JSON files from tooldistillator"/>
+    </inputs>
+
+    <outputs>
+        <data name="output" format="xlsx" label="Enhanced STARAMR report"/>
+    </outputs>
+
+    <help><![CDATA[
+**What it does**
+
+Integrates quality control outputs from the Abromics workflow into a unified STARAMR Excel report.
+
+**Quality Assessment**
+
+Automatically evaluates each sample against thresholds built into the script:
+- Total reads after filtering: Warning <500k, Fail <400k
+- Read filtering ratio: Warning <0.8, Fail <0.6
+- Bracken main taxon proportion: Warning <90%, Fail <80%
+- Assembly metrics (N50, contigs, etc.)
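+
+For example, a sample whose assembly falls just short of two thresholds might receive
+the feedback string ``Low N50 (<35000); High number of contigs >500 bp (>400)``.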
+
+**Outputs**
+
+An enhanced Excel workbook with additional sheets: FastP, Bracken, Quast, and Quality_metrics
+    ]]></help>
+
+    <creator>
+        <organization name="SPAMR-VET Consortium"/>
+    </creator>
+
+    <citations>
+        <citation type="bibtex">
+@misc{fbqs_integrator,
+    author = {Diaconu, Elena Lavinia and Kochanowski, Maciej and Leclercq, Sebastien},
+    title = {FBQS Integrator},
+    year = {2025},
+    note = {SPAMR-VET Consortium (EUPAHW)}
+}
+        </citation>
+    </citations>
+</tool>
--- a/fbqs_integrator/fbqs_integrator/fbqs_integrator/fbqs-integrator.py	Mon Jul 14 06:48:32 2025 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,447 +0,0 @@
-###################################################
-#
-# Includes Fastp, Bracken, and Quast results into STARAMR spreadsheet output
-# Input files are tooldistillator outputs from the Abromics Workflow, in JSON or EXCEL format.
-# Perform quality check at the same time.
-#
-# Created: 2025-01-29
-# Last Edited: 2025-05-14
-#
-# Contributors:
-#
-# Elena Lavinia Diaconu (elena.diaconu@izslt.it)
-# Maciej Kochanowski (maciej.kochanowski@piwet.pulawy.pl)
-# Sebastien Leclercq (sebastien.leclercq@inrae.fr)
-#
-# developed as part of the SPAMR-VET Consortium (EUPAHW)
-###################################################
-
-import os
-import re
-import json
-import argparse
-import numpy as np
-import pandas as pd
-from openpyxl import load_workbook
-
-def extract_sample_id(file_path):
-    try:
-        with open(file_path, 'r') as f:
-            data = json.load(f)
-        
-        if isinstance(data, list):
-            fastp_section = next((i for i in data if i.get('analysis_software_name') == 'fastp'), None)
-            if fastp_section:
-                fastp_report = next((r for r in fastp_section.get('results', []) if r.get('name') == 'fastp_report'), None)
-                if fastp_report and fastp_report.get('content'):
-                    command_str = fastp_report['content'][0].get('command', '')
-                    match = re.search(r'fastp report for ([^\s]+)\.fastqsanger\.gz', command_str)
-                    if match:
-                        return match.group(1)
-        
-        if isinstance(data, dict) and data.get('analysis_software_name') == 'quast':
-            for r in data.get('results', []):
-                if r.get('name') == 'quast_report' and isinstance(r.get('content'), list) and r['content']:
-                    assembly = r['content'][0].get('Assembly', None)
-                    if assembly:
-                        return assembly
-        
-        if isinstance(data, list):
-            quast_section = next((i for i in data if i.get('analysis_software_name') == 'quast'), None)
-            if quast_section and 'results' in quast_section:
-                for r in quast_section['results']:
-                    if r.get('name') == 'quast_report' and isinstance(r.get('content'), list) and r['content']:
-                        assembly = r['content'][0].get('Assembly', None)
-                        if assembly:
-                            return assembly
-    
-    except Exception as e:
-        pass
-    
-    base = os.path.basename(file_path)
-    name = re.sub(r'\.(json|csv|tsv|fasta|fa|xlsx|txt|dat)$', '', base, flags=re.IGNORECASE)
-    for suffix in ['_FastpKraken', '_quast', '_fastp', '_kraken', '_bracken', '_recentrifuge', '_report', '-1']:
-        if name.endswith(suffix):
-            name = name[:-len(suffix)]
-            break
-    return name.strip('_- ')
-
-def safe_float_convert(v, default=np.nan):
-    try:
-        if isinstance(v, str):
-            v = v.replace(',', '')
-        return float(v)
-    except:
-        return default
-
-def safe_int_convert(v, default=np.nan):
-    try:
-        if isinstance(v, str):
-            v = v.replace(',', '')
-        return int(float(v))
-    except:
-        return default
-
-def parse_fastp_json(data):
-    sec = next((i for i in data if i.get('analysis_software_name') == 'fastp'), None)
-    if not sec:
-        raise ValueError("fastp section missing")
-    rep = next((r for r in sec.get('results', []) if r.get('name') == 'fastp_report'), None)
-    if not rep or not rep.get('content'):
-        raise ValueError("fastp_report missing")
-    c = rep['content'][0]
-    pre = c.get('summary', {}).get('before_filtering', {})
-    post = c.get('summary', {}).get('after_filtering', {})
-    filt = c.get('filtering_result', {})
-    dup = c.get('duplication', {})
-    return {
-        'total_reads': pre.get('total_reads'),
-        'total_bases': pre.get('total_bases'),
-        'q20_rate': pre.get('q20_rate'),
-        'q30_rate': pre.get('q30_rate'),
-        'gc_content': pre.get('gc_content'),
-        'duplication_rate': dup.get('rate'),
-        'passed_filter_reads': filt.get('passed_filter_reads'),
-        'low_quality_reads': filt.get('low_quality_reads'),
-        'total_reads_after': post.get('total_reads'),
-        'ratio_reads_after': post.get('total_reads')/pre.get('total_reads') if pre.get('total_reads') else np.nan,
-        'total_bases_after': post.get('total_bases'),
-        'q30_rate_after': post.get('q30_rate'),
-        'gc_content_after': post.get('gc_content')
-    }
-
-def extract_top_taxa_from_bracken(data):
-    sec = next((i for i in data if i.get('analysis_software_name') == 'bracken'), None)
-    result = {f'{p}_{i}': np.nan for i in range(1,6) for p in ['taxon','count','proportion']}
-    if not sec:
-        return result
-    rep = next((r for r in sec.get('results', []) if r.get('name') == 'bracken_report'), None)
-    content = rep.get('content', []) if rep else []
-    taxa = []
-    for item in content:
-        taxa.append({
-            'name': item['name'],
-            'count': item.get('new_est_reads'),
-            'score': item.get('fraction_total_reads')
-        })
-    taxa = sorted(taxa, key=lambda x: x['count'] or 0, reverse=True)[:5]
-    for idx, t in enumerate(taxa, 1):
-        result[f'taxon_{idx}'] = t['name']
-        result[f'count_{idx}'] = safe_int_convert(t['count'])
-        result[f'proportion_{idx}'] = safe_float_convert(t['score'])*100
-    return result
-
-def parse_quast_json(path):
-    try:
-        with open(path, 'r') as f:
-            data = json.load(f)
-        sec = None
-        if isinstance(data, list):
-            sec = next((i for i in data if i.get('analysis_software_name') == 'quast'), None)
-        elif isinstance(data, dict) and data.get('analysis_software_name') == 'quast':
-            sec = data
-        if sec and 'results' in sec:
-            for r in sec['results']:
-                if r.get('name') == 'quast_report' and isinstance(r.get('content'), list) and r['content']:
-                    raw = r['content'][0]
-                    break
-            else:
-                raise ValueError("quast_report content missing")
-        else:
-            raise ValueError("quast section missing")
-        cleaned = {}
-        for k, v in raw.items():
-            key = str(k).strip()
-            key = key.replace(' ', '_').replace('#', 'num').replace("(%", 'percent').replace(")", '').replace("(", '')
-            key = re.sub(r'contigs_>=_?(\d+)_?bp', r'num_contigs_\1bp', key)
-            if key == 'contigs':
-                key = 'num_contigs_500bp'
-            if key == 'GC':
-                key = 'GC_percent'
-            cleaned[key] = v
-        for e in ['N50','Total_length','Largest_contig','num_contigs_500bp','GC_percent']:
-            if e not in cleaned:
-                cleaned[e] = np.nan
-        return cleaned
-    except Exception:
-        try:
-            df = pd.read_csv(path, sep='\t', engine='python')
-        except:
-            df = pd.read_csv(path, sep=',', engine='python')
-        if df.shape[1] == 2 and df.shape[0] > 1:
-            d = pd.Series(df.iloc[:,1].values, index=df.iloc[:,0]).to_dict()
-        else:
-            d = df.iloc[0].to_dict()
-        cleaned = {}
-        for k,v in d.items():
-            key = str(k).strip().replace(' ', '_').replace('#','num').replace('(%)','percent')
-            key = re.sub(r'contigs_>=_?(\d+)_?bp', r'num_contigs_\1bp', key)
-            if key == 'GC':
-                key = 'GC_percent'
-            cleaned[key] = v
-        for e in ['N50','Total_length','Largest_contig','num_contigs_500bp','GC_percent']:
-            if e not in cleaned:
-                cleaned[e] = np.nan
-        return cleaned
-
-def calculate_quality_metrics(fastp, bracken, quast):
-    thresholds = {
-        'total reads after filtering': {'warning': 500000, 'fail': 400000},
-        'ratio of filtered reads': {'warning': 0.8, 'fail': 0.6},
-        'Bracken main taxon proportion': {'warning': 90, 'fail': 80},
-        'reads gc content': {'warning': (0.3,0.7), 'fail': (0.25,0.75)},
-        'N50': {'warning': 35000, 'fail': 30000},
-        'largest contig': {'warning': 100000, 'fail': 50000},
-        'number of contigs >500 bp': {'warning': 400, 'fail': 500},
-        'number of contigs >0 bp': {'warning': 500, 'fail': 700},
-    }
-    fb = []
-    result = 'Passed'
-    def chk(val, name, mode='low'):
-        nonlocal result
-        if pd.isna(val):
-            fb.append(f"Missing {name}")
-            return
-        w = thresholds[name]['warning']
-        f = thresholds[name]['fail']
-        if mode == 'low':
-            if val < f:
-                result, fb_msg = 'Fail', f"Low {name} (<{f})"
-            elif val < w:
-                if result != 'Fail':
-                    result = 'Warning'
-                fb_msg = f"Low {name} (<{w})"
-            else:
-                return
-        elif mode == 'high':
-            if val > f:
-                result, fb_msg = 'Fail', f"High {name} (>{f})"
-            elif val > w:
-                if result != 'Fail':
-                    result = 'Warning'
-                fb_msg = f"High {name} (>{w})"
-            else:
-                return
-        elif mode == 'range':
-            if not (f[0] <= val <= f[1]):
-                result, fb_msg = 'Fail', f"Abnormal {name}"
-            elif not (w[0] <= val <= w[1]):
-                if result != 'Fail':
-                    result = 'Warning'
-                fb_msg = f"Borderline {name}"
-            else:
-                return
-        fb.append(fb_msg)
-    
-    chk(safe_int_convert(fastp.get('total_reads_after')), 'total reads after filtering', 'low')
-    chk(safe_float_convert(fastp.get('ratio_reads_after')), 'ratio of filtered reads', 'low')
-    chk(safe_float_convert(bracken.get('proportion_1')), 'Bracken main taxon proportion', 'low')
-    chk(safe_float_convert(fastp.get('gc_content')), 'reads gc content', 'range')
-    chk(safe_int_convert(quast.get('N50')), 'N50', 'low')
-    chk(safe_int_convert(quast.get('Largest_contig')), 'largest contig','low')
-    chk(safe_int_convert(quast.get('num_contigs_500bp')), 'number of contigs >500 bp', 'high')
-    chk(safe_int_convert(quast.get('num_contigs_0bp')), 'number of contigs >0 bp', 'high')
-    
-    if not fb:
-        fb = ['All tests passed']
-    return {'Quality_Module': result, 'Quality_Feedback': '; '.join(fb)}
-
-def process_samples(fastp_files, quast_files, bracken_files=None):
-    fastp_map = {extract_sample_id(p): p for p in fastp_files}
-    quast_map = {extract_sample_id(p): p for p in quast_files}
-    bracken_map = {extract_sample_id(p): p for p in bracken_files} if bracken_files else {}
-    all_samples = set(fastp_map.keys()) | set(quast_map.keys()) | set(bracken_map.keys())
-    all_data = []
-    seen = set()
-    
-    for sample_id in all_samples:
-        if sample_id in seen:
-            print(f"Warning: duplicate sample ID {sample_id}")
-            continue
-        seen.add(sample_id)
-        
-        entry = {'sample_id': sample_id}
-        
-        if sample_id in fastp_map:
-            try:
-                jd = json.load(open(fastp_map[sample_id]))
-                entry['fastp'] = parse_fastp_json(jd)
-                if not bracken_files:
-                    entry['bracken'] = extract_top_taxa_from_bracken(jd)
-            except Exception as e:
-                print(f"Error parsing FastP for {sample_id}: {e}")
-                entry['fastp'] = {}
-                if not bracken_files:
-                    entry['bracken'] = {}
-        else:
-            entry['fastp'] = {}
-            if not bracken_files:
-                entry['bracken'] = {}
-        
-        if bracken_files:
-            if sample_id in bracken_map:
-                try:
-                    jd = json.load(open(bracken_map[sample_id]))
-                    entry['bracken'] = extract_top_taxa_from_bracken(jd)
-                except Exception as e:
-                    print(f"Error parsing Bracken for {sample_id}: {e}")
-                    entry['bracken'] = {}
-            else:
-                entry['bracken'] = {}
-        
-        if sample_id in quast_map:
-            try:
-                entry['quast'] = parse_quast_json(quast_map[sample_id])
-            except Exception as e:
-                print(f"Error parsing QUAST for {sample_id}: {e}")
-                entry['quast'] = {}
-        else:
-            print(f"Warning: no QUAST file for {sample_id}")
-            entry['quast'] = {}
-        
-        entry['qc'] = calculate_quality_metrics(entry['fastp'], entry['bracken'], entry['quast'])
-        all_data.append(entry)
-    
-    return all_data
-
-def create_aggregated_dataframes(all_data):
-    fp_cols = ['sample_id','total_reads','total_bases','q20_rate','q30_rate','gc_content',
-               'duplication_rate','passed_filter_reads','low_quality_reads',
-               'total_reads_after','total_bases_after','q30_rate_after','gc_content_after','ratio_reads_after']
-    br_cols = ['sample_id'] + [f'{p}_{i}' for i in range(1,6) for p in ['taxon','count','proportion']]
-    q_keys = set(['sample_id'])
-    for e in all_data:
-        q_keys.update(e['quast'].keys())
-    std = ['N50','Total_length','Largest_contig','num_contigs_500bp','GC_percent']
-    qu_cols = ['sample_id'] + [k for k in std if k in q_keys] + sorted([k for k in q_keys if k not in std and k!='sample_id'])
-    qc_cols = ['sample_id','Quality_Module','Quality_Feedback',
-               'total_reads_after','ratio_reads_after','bracken_main_taxon','proportion_of_main_taxon','reads_gc_content','N50','Total_length','Largest_contig','num_contigs_500bp','num_contigs_0bp']
-    
-    fp_list, br_list, qu_list, qc_list = [],[],[],[]
-    for e in all_data:
-        sid = e['sample_id']
-        fp = e['fastp']
-        br = e['bracken']
-        qu = e['quast']
-        qc = e['qc']
-        
-        fp_list.append({c: fp.get(c,np.nan) if c!='sample_id' else sid for c in fp_cols})
-        br_list.append({c: br.get(c,np.nan) if c!='sample_id' else sid for c in br_cols})
-        qu_list.append({c: qu.get(c,np.nan) if c!='sample_id' else sid for c in qu_cols})
-        qc_list.append({
-            **{
-                'sample_id': sid,
-                'Quality_Module': qc['Quality_Module'],
-                'Quality_Feedback': qc['Quality_Feedback']
-            },
-            **{
-                'total_reads_after': safe_int_convert(fp.get('total_reads_after')),
-                'ratio_reads_after': safe_float_convert(fp.get('ratio_reads_after')),
-                'bracken_main_taxon': str(br.get('taxon_1')),
-                'proportion_of_main_taxon': safe_float_convert(br.get('proportion_1')),
-                'reads_gc_content': safe_float_convert(fp.get('gc_content')),
-                'N50': safe_int_convert(qu.get('N50')),
-                'Total_length': safe_int_convert(qu.get('Total_length')),
-                'Largest_contig': safe_int_convert(qu.get('Largest_contig')),
-                'num_contigs_500bp': safe_int_convert(qu.get('num_contigs_500bp')),
-                'num_contigs_0bp': safe_int_convert(qu.get('num_contigs_0bp'))
-            }
-        })
-    
-    return (
-        pd.DataFrame(fp_list, columns=fp_cols),
-        pd.DataFrame(qu_list, columns=qu_cols),
-        pd.DataFrame(br_list, columns=br_cols),
-        pd.DataFrame(qc_list, columns=qc_cols)
-    )
-
-def write_enhanced_excel(staramr_xlsx, output_xlsx, fastp_df, quast_df, bracken_df, quality_df):
-    try:
-        orig = pd.read_excel(staramr_xlsx, sheet_name=None, engine='openpyxl')
-    except Exception as e:
-        print(f"Error reading {staramr_xlsx}: {e}")
-        return
-    
-    if 'Summary' in orig:
-        orig['Summary'] = orig['Summary'].astype({'Quality Module': str,'Quality Module Feedback': str })
-        sdf = orig['Summary']
-        samp = next((c for c in sdf.columns if 'id' in c.lower()), sdf.columns[0])
-        
-        fbcol = 'Quality Module'
-        if fbcol not in sdf.columns:
-            sdf.insert(sdf.columns.get_loc(samp)+1, fbcol, '')
-        fb_map = {
-            str(r['sample_id']): r['Quality_Module'] for _,r in quality_df.iterrows()
-        }
-        fb_df = pd.DataFrame(list(fb_map.items()), columns=[samp, fbcol])
-        fb_df[samp] = fb_df[samp].astype(str)
-        orig['Summary'].update(fb_df, join='left')
-        
-        fbcol = 'Quality Module Feedback'
-        if fbcol not in sdf.columns:
-            sdf.insert(sdf.columns.get_loc(samp)+2, fbcol, '')
-        fb_map = {
-            str(r['sample_id']): r['Quality_Feedback'] for _,r in quality_df.iterrows()
-        }
-        fb_df = pd.DataFrame(list(fb_map.items()), columns=[samp, fbcol])
-        fb_df[samp] = fb_df[samp].astype(str)
-        orig['Summary'].update(fb_df, join='left')
-        
-        fbcol = 'Detected main taxon'
-        if fbcol not in sdf.columns:
-            sdf.insert(sdf.columns.get_loc(samp)+2, fbcol, '')
-        fb_map = {
-            str(r['sample_id']): r['bracken_main_taxon'] for _,r in quality_df.iterrows()
-        }
-        fb_df = pd.DataFrame(list(fb_map.items()), columns=[samp, fbcol])
-        fb_df[samp] = fb_df[samp].astype(str)
-        orig['Summary'].update(fb_df, join='left')
-        
-        orig['Summary'].drop(columns=[f'Scheme'], inplace=True)
-    else:
-        print("Warning: no Summary sheet to update")
-    
-    for name, df in orig.items():
-        if isinstance(df, pd.DataFrame):
-            if 'Isolate ID' in df.columns:
-                df.sort_values(by='Isolate ID', inplace=True)
-                orig[name] = df
-    
-    if not fastp_df.empty:
-        fastp_df.sort_values(by='sample_id', inplace=True)
-    if not quast_df.empty:
-        quast_df.sort_values(by='sample_id', inplace=True)
-    if not bracken_df.empty:
-        bracken_df.sort_values(by='sample_id', inplace=True)
-    if not quality_df.empty:
-        quality_df.sort_values(by='sample_id', inplace=True)
-    
-    with pd.ExcelWriter(output_xlsx, engine='openpyxl') as w:
-        for name, df in orig.items():
-            df.to_excel(w, sheet_name=name, index=False)
-        if not fastp_df.empty:
-            fastp_df.fillna('').to_excel(w, sheet_name='FastP', index=False)
-        if not quast_df.empty:
-            quast_df.fillna('').to_excel(w, sheet_name='Quast', index=False)
-        if not bracken_df.empty:
-            bracken_df.fillna('').to_excel(w, sheet_name='Bracken', index=False)
-        if not quality_df.empty:
-            quality_df.fillna('').to_excel(w, sheet_name='Quality_metrics', index=False)
-    
-    print(f"Enhanced report saved to {output_xlsx}")
-
-def main():
-    p = argparse.ArgumentParser(description="Integrate FastP, Bracken, QUAST into STARAMR Excel report")
-    p.add_argument('--staramr', required=True, help="input STARAMR .xlsx")
-    p.add_argument('--fastp-jsons', nargs='+', required=True, help="FastP JSON files")
-    p.add_argument('--quast-files', nargs='+', required=True, help="QUAST JSON/TSV/CSV files")
-    p.add_argument('--bracken-jsons', nargs='+', required=False, help="Bracken JSON files (optional)")
-    p.add_argument('--output', required=True, help="output enhanced .xlsx")
-    args = p.parse_args()
-    
-    all_data = process_samples(args.fastp_jsons, args.quast_files, args.bracken_jsons)
-    fp_df, qu_df, br_df, qc_df = create_aggregated_dataframes(all_data)
-    write_enhanced_excel(args.staramr, args.output, fp_df, qu_df, br_df, qc_df)
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file
--- a/fbqs_integrator/fbqs_integrator/fbqs_integrator/fbqs_integrator.xml	Mon Jul 14 06:48:32 2025 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,81 +0,0 @@
-<tool id="fbqs_integrator" name="FBQS Integrator" version="1.0.0" profile="21.05" license="GPL-3.0-or-later">
-    <description>Merge quality-checked outputs from Fastp, Bracken and QUAST into consolidated STARAMR Excel report</description>
-    
-    <requirements>
-        <requirement type="package" version="3.9">python</requirement>
-        <requirement type="package" version="1.5.3">pandas</requirement>
-        <requirement type="package" version="3.0.9">openpyxl</requirement>
-        <requirement type="package" version="1.21.0">numpy</requirement>
-    </requirements>
-
-    <command detect_errors="exit_code"><![CDATA[
-        python '$__tool_directory__/fbqs-integrator.py'
-        --staramr '$staramr'
-        --fastp-jsons
-        #for $file in $fastp_jsons
-            '$file'
-        #end for
-        --quast-files
-        #for $file in $quast_files
-            '$file'
-        #end for
-        #if $bracken_jsons
-        --bracken-jsons
-        #for $file in $bracken_jsons
-            '$file'
-        #end for
-        #end if
-        --output '$output'
-    ]]></command>
-
-    <inputs>
-        <param name="staramr" type="data" format="xlsx" label="Input STARAMR report (.xlsx)" 
-               help="STARAMR Excel report from upstream workflow"/>
-        <param name="fastp_jsons" type="data" format="json" multiple="true" 
-               label="Fastp JSON outputs" 
-               help="JSON files from tooldistillator containing Fastp results"/>
-        <param name="quast_files" type="data" format="json,tabular,csv" multiple="true" 
-               label="QUAST outputs" 
-               help="QUAST result files in JSON, TSV, or CSV format"/>
-        <param name="bracken_jsons" type="data" format="json" multiple="true" optional="true"
-               label="Bracken JSON outputs (optional)" 
-               help="Optional separate Bracken JSON files from tooldistillator"/>
-    </inputs>
-
-    <outputs>
-        <data name="output" format="xlsx" label="Enhanced STARAMR report"/>
-    </outputs>
-
-    <help><![CDATA[
-**What it does**
-
-Integrates quality control outputs from the Abromics workflow into a unified STARAMR Excel report.
-
-**Quality Assessment**
-
-Automatically evaluates samples based on thresholds defined in the Python code:
-- Total reads after filtering: Warning <500k, Fail <400k
-- Read filtering ratio: Warning <0.8, Fail <0.6
-- Bracken main taxon proportion: Warning <90%, Fail <80%
-- Assembly metrics (N50, contigs, etc.)
-
-**Outputs**
-
-Enhanced Excel with additional sheets: FastP, Bracken, Quast, Quality_metrics
-    ]]></help>
-
-    <creator>
-        <organization name="SPAMR-VET Consortium"/>
-    </creator>
-
-    <citations>
-        <citation type="bibtex">
-@misc{fbqs_integrator,
-    author = {Diaconu, Elena Lavinia and Kochanowski, Maciej and Leclercq, Sebastien},
-    title = {FBQS Integrator},
-    year = {2025},
-    note = {SPAMR-VET Consortium (EUPAHW)}
-}
-        </citation>
-    </citations>
-</tool>