diff utils.py @ 8:85e6f4b2ad18 draft default tip
planemo upload for repository https://github.com/goeckslab/gleam.git commit 8a42eb9b33df7e1df5ad5153b380e20b910a05b6
author | goeckslab |
---|---|
date | Thu, 14 Aug 2025 14:53:10 +0000 |
parents | 186424a7eca7 |
children | |
```diff
--- a/utils.py	Fri Aug 08 13:06:28 2025 +0000
+++ b/utils.py	Thu Aug 14 14:53:10 2025 +0000
@@ -8,6 +8,8 @@
 <head>
     <meta charset="UTF-8">
     <title>Galaxy-Ludwig Report</title>
+
+    <!-- your existing styles -->
     <style>
         body {
             font-family: Arial, sans-serif;
@@ -32,29 +34,21 @@
             color: #4CAF50;
             padding-bottom: 5px;
         }
+        /* baseline table setup */
         table {
             border-collapse: collapse;
             margin: 20px 0;
             width: 100%;
-            table-layout: fixed; /* Enforces consistent column widths */
+            table-layout: fixed;
         }
         table, th, td {
             border: 1px solid #ddd;
         }
         th, td {
             padding: 8px;
-            text-align: center; /* Center-align text */
-            vertical-align: middle; /* Center-align content vertically */
-            word-wrap: break-word; /* Break long words to avoid overflow */
-        }
-        th:first-child, td:first-child {
-            width: 5%; /* Smaller width for the first column */
-        }
-        th:nth-child(2), td:nth-child(2) {
-            width: 50%; /* Wider for the metric/description column */
-        }
-        th:last-child, td:last-child {
-            width: 25%; /* Value column gets remaining space */
+            text-align: center;
+            vertical-align: middle;
+            word-wrap: break-word;
         }
         th {
             background-color: #4CAF50;
@@ -68,7 +62,105 @@
             max-width: 100%;
             height: auto;
         }
+
+        /* -------------------
+           SORTABLE COLUMNS
+           ------------------- */
+        table.performance-summary th.sortable {
+            cursor: pointer;
+            position: relative;
+            user-select: none;
+        }
+        /* hide arrows by default */
+        table.performance-summary th.sortable::after {
+            content: '';
+            position: absolute;
+            right: 12px;
+            top: 50%;
+            transform: translateY(-50%);
+            font-size: 0.8em;
+            color: #666;
+        }
+        /* three states */
+        table.performance-summary th.sortable.sorted-none::after {
+            content: '⇅';
+        }
+        table.performance-summary th.sortable.sorted-asc::after {
+            content: '↑';
+        }
+        table.performance-summary th.sortable.sorted-desc::after {
+            content: '↓';
+        }
     </style>
+
+    <!-- sorting script -->
+    <script>
+    document.addEventListener('DOMContentLoaded', () => {
+      // 1) record each row's original position
+      document.querySelectorAll('table.performance-summary tbody').forEach(tbody => {
+        Array.from(tbody.rows).forEach((row, i) => {
+          row.dataset.originalOrder = i;
+        });
+      });
+
+      const getText = cell => cell.innerText.trim();
+      const comparer = (idx, asc) => (a, b) => {
+        const v1 = getText(a.children[idx]);
+        const v2 = getText(b.children[idx]);
+        const n1 = parseFloat(v1), n2 = parseFloat(v2);
+        if (!isNaN(n1) && !isNaN(n2)) {
+          return asc ? n1 - n2 : n2 - n1;
+        }
+        return asc
+          ? v1.localeCompare(v2)
+          : v2.localeCompare(v1);
+      };
+
+      document
+        .querySelectorAll('table.performance-summary th.sortable')
+        .forEach(th => {
+          // initialize to "none" state
+          th.classList.add('sorted-none');
+          th.addEventListener('click', () => {
+            const table = th.closest('table');
+            const allTh = table.querySelectorAll('th.sortable');
+
+            // 1) determine current state BEFORE clearing classes
+            let curr = th.classList.contains('sorted-asc')
+              ? 'asc'
+              : th.classList.contains('sorted-desc')
+                ? 'desc'
+                : 'none';
+            // 2) cycle to next state
+            let next = curr === 'none'
+              ? 'asc'
+              : curr === 'asc'
+                ? 'desc'
+                : 'none';
+
+            // 3) clear all sort markers
+            allTh.forEach(h =>
+              h.classList.remove('sorted-none', 'sorted-asc', 'sorted-desc')
+            );
+            // 4) apply the new marker
+            th.classList.add(`sorted-${next}`);
+
+            // 5) sort or restore original order
+            const tbody = table.querySelector('tbody');
+            let rows = Array.from(tbody.rows);
+            if (next === 'none') {
+              rows.sort((a, b) =>
+                a.dataset.originalOrder - b.dataset.originalOrder
+              );
+            } else {
+              const idx = Array.from(th.parentNode.children).indexOf(th);
+              rows.sort(comparer(idx, next === 'asc'));
+            }
+            rows.forEach(r => tbody.appendChild(r));
+          });
+        });
+    });
+    </script>
 </head>
 <body>
     <div class="container">
@@ -203,7 +295,7 @@
     </style>
     <div class="tabs">
-        <div class="tab active" onclick="showTab('metrics')">Config & Results Summary</div>
+        <div class="tab active" onclick="showTab('metrics')">Config and Results Summary</div>
         <div class="tab" onclick="showTab('trainval')">Train/Validation Results</div>
         <div class="tab" onclick="showTab('test')">Test Results</div>
         <!-- always-visible help button -->
@@ -232,122 +324,193 @@
 def get_metrics_help_modal() -> str:
-    modal_html = """
-<div id="metricsHelpModal" class="modal">
-  <div class="modal-content">
-    <span class="close">×</span>
-    <h2>Model Evaluation Metrics — Help Guide</h2>
-    <div class="metrics-guide">
-      <h3>1) General Metrics</h3>
-      <p><strong>Loss:</strong> Measures the difference between predicted and actual values. Lower is better. Often used for optimization during training.</p>
-      <p><strong>Accuracy:</strong> Proportion of correct predictions among all predictions. Simple but can be misleading for imbalanced datasets.</p>
-      <p><strong>Micro Accuracy:</strong> Calculates accuracy by summing up all individual true positives and true negatives across all classes, making it suitable for multiclass or multilabel problems.</p>
-      <p><strong>Token Accuracy:</strong> Measures how often the predicted tokens (e.g., in sequences) match the true tokens. Useful in sequence prediction tasks like NLP.</p>
-      <h3>2) Precision, Recall & Specificity</h3>
-      <p><strong>Precision:</strong> Out of all positive predictions, how many were correct. Precision = TP / (TP + FP). Helps when false positives are costly.</p>
-      <p><strong>Recall (Sensitivity):</strong> Out of all actual positives, how many were predicted correctly. Recall = TP / (TP + FN). Important when missing positives is risky.</p>
-      <p><strong>Specificity:</strong> True negative rate. Measures how well the model identifies negatives. Specificity = TN / (TN + FP). Useful in medical testing to avoid false alarms.</p>
-      <h3>3) Macro, Micro, and Weighted Averages</h3>
-      <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric across all classes, treating each class equally, regardless of class frequency. Best when class sizes are balanced.</p>
-      <p><strong>Micro Precision / Recall / F1:</strong> Aggregates TP, FP, FN across all classes before computing the metric. Gives a global view and is ideal for class-imbalanced problems.</p>
-      <p><strong>Weighted Precision / Recall / F1:</strong> Averages each metric across classes, weighted by the number of true instances per class. Balances importance of classes based on frequency.</p>
-      <h3>4) Average Precision (PR-AUC Variants)</h3>
-      <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged across all classes equally. Useful for balanced multi-class problems.</p>
-      <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC using all instances. Best for imbalanced data or multi-label classification.</p>
-      <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged across individual samples (not classes). Ideal for multi-label problems where each sample can belong to multiple classes.</p>
-      <h3>5) ROC-AUC Variants</h3>
-      <p><strong>ROC-AUC:</strong> Measures model's ability to distinguish between classes. AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>
-      <p><strong>Macro ROC-AUC:</strong> Averages the AUC across all classes equally. Suitable when classes are balanced and of equal importance.</p>
-      <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions across all classes. Useful in multiclass or multilabel settings with imbalance.</p>
-      <h3>6) Ranking Metrics</h3>
-      <p><strong>Hits at K:</strong> Measures whether the true label is among the top-K predictions. Common in recommendation systems and retrieval tasks.</p>
-      <h3>7) Confusion Matrix Stats (Per Class)</h3>
-      <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions for positives and negatives respectively.</p>
-      <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions — false alarms and missed detections.</p>
-      <h3>8) Other Useful Metrics</h3>
-      <p><strong>Cohen's Kappa:</strong> Measures agreement between predicted and actual values adjusted for chance. Useful for multiclass classification with imbalanced labels.</p>
-      <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure of prediction quality that takes into account TP, TN, FP, and FN. Particularly effective for imbalanced datasets.</p>
-      <h3>9) Metric Recommendations</h3>
-      <ul>
-        <li>Use <strong>Accuracy + F1</strong> for balanced data.</li>
-        <li>Use <strong>Precision, Recall, ROC-AUC</strong> for imbalanced datasets.</li>
-        <li>Use <strong>Average Precision Micro</strong> for multilabel or class-imbalanced problems.</li>
-        <li>Use <strong>Macro scores</strong> when all classes should be treated equally.</li>
-        <li>Use <strong>Weighted scores</strong> when class imbalance should be accounted for without ignoring small classes.</li>
-        <li>Use <strong>Confusion Matrix stats</strong> to analyze class-wise performance.</li>
-        <li>Use <strong>Hits at K</strong> for recommendation or ranking-based tasks.</li>
-      </ul>
-    </div>
-  </div>
-</div>
-"""
-    modal_css = """
-<style>
-.modal {
-    display: none;
-    position: fixed;
-    z-index: 1;
-    left: 0;
-    top: 0;
-    width: 100%;
-    height: 100%;
-    overflow: auto;
-    background-color: rgba(0,0,0,0.4);
-}
-.modal-content {
-    background-color: #fefefe;
-    margin: 15% auto;
-    padding: 20px;
-    border: 1px solid #888;
-    width: 80%;
-    max-width: 800px;
-}
-.close {
-    color: #aaa;
-    float: right;
-    font-size: 28px;
-    font-weight: bold;
-}
-.close:hover,
-.close:focus {
-    color: black;
-    text-decoration: none;
-    cursor: pointer;
-}
-.metrics-guide h3 {
-    margin-top: 20px;
-}
-.metrics-guide p {
-    margin: 5px 0;
-}
-.metrics-guide ul {
-    margin: 10px 0;
-    padding-left: 20px;
-}
-</style>
-"""
-    modal_js = """
-<script>
-document.addEventListener("DOMContentLoaded", function() {
-    var modal = document.getElementById("metricsHelpModal");
-    var openBtn = document.getElementById("openMetricsHelp");
-    var span = document.getElementsByClassName("close")[0];
-    if (openBtn && modal) {
-        openBtn.onclick = function() {
-            modal.style.display = "block";
-        };
-    }
-    if (span && modal) {
-        span.onclick = function() {
-            modal.style.display = "none";
-        };
-    }
-    window.onclick = function(event) {
-        if (event.target == modal) {
-            modal.style.display = "none";
-        }
-    }
-});
-</script>
-"""
+    modal_html = (
+        '<div id="metricsHelpModal" class="modal">'
+        '  <div class="modal-content">'
+        '    <span class="close">×</span>'
+        '    <h2>Model Evaluation Metrics — Help Guide</h2>'
+        '    <div class="metrics-guide">'
+        '      <h3>1) General Metrics (Regression and Classification)</h3>'
+        '      <p><strong>Loss (Regression & Classification):</strong> '
+        'Measures the difference between predicted and actual values, '
+        'optimized during training. Lower is better. '
+        'For regression, this is often Mean Squared Error (MSE) or '
+        'Mean Absolute Error (MAE). For classification, it’s typically '
+        'cross-entropy or log loss.</p>'
+        '      <h3>2) Regression Metrics</h3>'
+        '      <p><strong>Mean Absolute Error (MAE):</strong> '
+        'Average of absolute differences between predicted and actual values, '
+        'in the same units as the target. Use for interpretable error measurement '
+        'when all errors are equally important. Less sensitive to outliers than MSE.</p>'
+        '      <p><strong>Mean Squared Error (MSE):</strong> '
+        'Average of squared differences between predicted and actual values. '
+        'Penalizes larger errors more heavily, useful when large deviations are critical. '
+        'Often used as the loss function in regression.</p>'
+        '      <p><strong>Root Mean Squared Error (RMSE):</strong> '
+        'Square root of MSE, in the same units as the target. '
+        'Balances interpretability and sensitivity to large errors. '
+        'Widely used for regression evaluation.</p>'
+        '      <p><strong>Mean Absolute Percentage Error (MAPE):</strong> '
+        'Average absolute error as a percentage of actual values. '
+        'Scale-independent, ideal for comparing relative errors across datasets. '
+        'Avoid when actual values are near zero.</p>'
+        '      <p><strong>Root Mean Squared Percentage Error (RMSPE):</strong> '
+        'Square root of mean squared percentage error. Scale-independent, '
+        'penalizes larger relative errors more than MAPE. Use for forecasting '
+        'or when relative accuracy matters.</p>'
+        '      <p><strong>R² Score:</strong> Proportion of variance in the target '
+        'explained by the model. Ranges from negative infinity to 1 (perfect prediction). '
+        'Use to assess model fit; negative values indicate poor performance '
+        'compared to predicting the mean.</p>'
+        '      <h3>3) Classification Metrics</h3>'
+        '      <p><strong>Accuracy:</strong> Proportion of correct predictions '
+        'among all predictions. Simple but misleading for imbalanced datasets, '
+        'where high accuracy may hide poor performance on minority classes.</p>'
+        '      <p><strong>Micro Accuracy:</strong> Sums true positives and true negatives '
+        'across all classes before computing accuracy. Suitable for multiclass or '
+        'multilabel problems with imbalanced data.</p>'
+        '      <p><strong>Token Accuracy:</strong> Measures how often predicted tokens '
+        '(e.g., in sequences) match true tokens. Common in NLP tasks like text generation '
+        'or token classification.</p>'
+        '      <p><strong>Precision:</strong> Proportion of positive predictions that are '
+        'correct (TP / (TP + FP)). Use when false positives are costly, e.g., spam detection.</p>'
+        '      <p><strong>Recall (Sensitivity):</strong> Proportion of actual positives '
+        'correctly predicted (TP / (TP + FN)). Use when missing positives is risky, '
+        'e.g., disease detection.</p>'
+        '      <p><strong>Specificity:</strong> True negative rate (TN / (TN + FP)). '
+        'Measures ability to identify negatives. Useful in medical testing to avoid '
+        'false alarms.</p>'
+        '      <h3>4) Classification: Macro, Micro, and Weighted Averages</h3>'
+        '      <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric '
+        'across all classes, treating each equally. Best for balanced datasets where '
+        'all classes are equally important.</p>'
+        '      <p><strong>Micro Precision / Recall / F1:</strong> Aggregates true positives, '
+        'false positives, and false negatives across all classes before computing. '
+        'Ideal for imbalanced or multilabel classification.</p>'
+        '      <p><strong>Weighted Precision / Recall / F1:</strong> Averages metrics '
+        'across classes, weighted by the number of true instances per class. Balances '
+        'class importance based on frequency.</p>'
+        '      <h3>5) Classification: Average Precision (PR-AUC Variants)</h3>'
+        '      <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged '
+        'equally across classes. Use for balanced multiclass problems.</p>'
+        '      <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC '
+        'using all instances. Best for imbalanced or multilabel classification.</p>'
+        '      <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged '
+        'across individual samples. Ideal for multilabel tasks where samples have multiple '
+        'labels.</p>'
+        '      <h3>6) Classification: ROC-AUC Variants</h3>'
+        '      <p><strong>ROC-AUC:</strong> Measures ability to distinguish between classes. '
+        'AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>'
+        '      <p><strong>Macro ROC-AUC:</strong> Averages AUC across all classes equally. '
+        'Suitable for balanced multiclass problems.</p>'
+        '      <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions '
+        'across all classes. Useful for imbalanced or multilabel settings.</p>'
+        '      <h3>7) Classification: Confusion Matrix Stats (Per Class)</h3>'
+        '      <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions '
+        'for positives and negatives, respectively.</p>'
+        '      <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions '
+        '— false alarms and missed detections.</p>'
+        '      <h3>8) Classification: Ranking Metrics</h3>'
+        '      <p><strong>Hits at K:</strong> Measures whether the true label is among the '
+        'top-K predictions. Common in recommendation systems and retrieval tasks.</p>'
+        '      <h3>9) Other Metrics (Classification)</h3>'
+        '      <p><strong>Cohen\'s Kappa:</strong> Measures agreement between predicted and '
+        'actual labels, adjusted for chance. Useful for multiclass classification with '
+        'imbalanced data.</p>'
+        '      <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure '
+        'using TP, TN, FP, and FN. Effective for imbalanced datasets.</p>'
+        '      <h3>10) Metric Recommendations</h3>'
+        '      <ul>'
+        '        <li><strong>Regression:</strong> Use <strong>RMSE</strong> or '
+        '<strong>MAE</strong> for general evaluation, <strong>MAPE</strong> for relative '
+        'errors, and <strong>R²</strong> to assess model fit. Use <strong>MSE</strong> or '
+        '<strong>RMSPE</strong> when large errors are critical.</li>'
+        '        <li><strong>Classification (Balanced Data):</strong> Use <strong>Accuracy</strong> '
+        'and <strong>F1</strong> for overall performance.</li>'
+        '        <li><strong>Classification (Imbalanced Data):</strong> Use <strong>Precision</strong>, '
+        '<strong>Recall</strong>, and <strong>ROC-AUC</strong> to focus on minority class '
+        'performance.</li>'
+        '        <li><strong>Multilabel or Imbalanced Classification:</strong> Use '
+        '<strong>Micro Precision/Recall/F1</strong> or <strong>Micro ROC-AUC</strong>.</li>'
+        '        <li><strong>Balanced Multiclass:</strong> Use <strong>Macro Precision/Recall/F1</strong> '
+        'or <strong>Macro ROC-AUC</strong>.</li>'
+        '        <li><strong>Class Frequency Matters:</strong> Use <strong>Weighted Precision/Recall/F1</strong> '
+        'to account for class imbalance.</li>'
+        '        <li><strong>Recommendation/Ranking:</strong> Use <strong>Hits at K</strong> for retrieval tasks.</li>'
+        '        <li><strong>Detailed Analysis:</strong> Use <strong>Confusion Matrix stats</strong> '
+        'for class-wise performance in classification.</li>'
+        '      </ul>'
+        '    </div>'
+        '  </div>'
+        '</div>'
+    )
+    modal_css = (
+        "<style>"
+        ".modal {"
+        "  display: none;"
+        "  position: fixed;"
+        "  z-index: 1;"
+        "  left: 0;"
+        "  top: 0;"
+        "  width: 100%;"
+        "  height: 100%;"
+        "  overflow: auto;"
+        "  background-color: rgba(0,0,0,0.4);"
+        "}"
+        ".modal-content {"
+        "  background-color: #fefefe;"
+        "  margin: 15% auto;"
+        "  padding: 20px;"
+        "  border: 1px solid #888;"
+        "  width: 80%;"
+        "  max-width: 800px;"
+        "}"
+        ".close {"
+        "  color: #aaa;"
+        "  float: right;"
+        "  font-size: 28px;"
+        "  font-weight: bold;"
+        "}"
+        ".close:hover,"
+        ".close:focus {"
+        "  color: black;"
+        "  text-decoration: none;"
+        "  cursor: pointer;"
+        "}"
+        ".metrics-guide h3 {"
+        "  margin-top: 20px;"
+        "}"
+        ".metrics-guide p {"
+        "  margin: 5px 0;"
+        "}"
+        ".metrics-guide ul {"
+        "  margin: 10px 0;"
+        "  padding-left: 20px;"
+        "}"
+        "</style>"
+    )
+    modal_js = (
+        "<script>"
+        'document.addEventListener("DOMContentLoaded", function() {'
+        '  var modal = document.getElementById("metricsHelpModal");'
+        '  var openBtn = document.getElementById("openMetricsHelp");'
+        '  var span = document.getElementsByClassName("close")[0];'
+        "  if (openBtn && modal) {"
+        "    openBtn.onclick = function() {"
+        "      modal.style.display = \"block\";"
+        "    };"
+        "  }"
+        "  if (span && modal) {"
+        "    span.onclick = function() {"
+        "      modal.style.display = \"none\";"
+        "    };"
+        "  }"
+        "  window.onclick = function(event) {"
+        "    if (event.target == modal) {"
+        "      modal.style.display = \"none\";"
+        "    }"
+        "  }"
+        "});"
+        "</script>"
+    )
     return modal_css + modal_html + modal_js
```
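The sorting script added above cycles each `performance-summary` header through none → asc → desc, sorting numerically when both cells parse as numbers and falling back to string comparison otherwise. A rough Python analogue of that comparer, handy for sanity-checking expected orderings (`sort_rows` is a hypothetical helper; the shipped logic is the inline JavaScript in the diff):

```python
# Sketch of the client-side comparer in Python. Not byte-for-byte: the JS
# compares cell pairs with parseFloat/localeCompare, while this builds a
# sort key that places numeric cells (in numeric order) ahead of text cells.
def sort_rows(rows, col, ascending=True):
    def key(row):
        text = row[col].strip()
        try:
            return (0, float(text), "")   # numeric cell: sort by value
        except ValueError:
            return (1, 0.0, text)         # non-numeric cell: string order
    return sorted(rows, key=key, reverse=not ascending)


rows = [["model-b", "10.5"], ["model-a", "9.25"], ["model-c", "n/a"]]
print(sort_rows(rows, col=1))
# [['model-a', '9.25'], ['model-b', '10.5'], ['model-c', 'n/a']]
```

Restoring the "none" state relies on the `data-original-order` index the script stamps on every row at load, so un-sorting is simply a sort on that stored position.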
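`get_metrics_help_modal()` returns the CSS, HTML, and JS concatenated as `modal_css + modal_html + modal_js`, so the whole modal can be dropped into any page that supplies an `openMetricsHelp` trigger. A minimal preview harness, assuming `utils.py` is importable from the repo root (the output file name `modal_preview.html` is arbitrary):

```python
# Hypothetical standalone preview: wire a #openMetricsHelp button to the
# markup returned by get_metrics_help_modal(). The sortable-column CSS/JS
# live in the main report template, so only the modal is exercised here.
from utils import get_metrics_help_modal

page = (
    "<html><head><meta charset='UTF-8'></head><body>"
    "<button id='openMetricsHelp'>Help</button>"
    + get_metrics_help_modal()
    + "</body></html>"
)

with open("modal_preview.html", "w", encoding="utf-8") as fh:
    fh.write(page)
```

Opening the file in a browser, the Help button should show the modal, and clicking the × or anywhere outside it should hide it again, matching the three handlers registered in `modal_js`.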