comparison utils.py @ 8:85e6f4b2ad18 draft default tip

planemo upload for repository https://github.com/goeckslab/gleam.git commit 8a42eb9b33df7e1df5ad5153b380e20b910a05b6
author goeckslab
date Thu, 14 Aug 2025 14:53:10 +0000
parents 186424a7eca7
children
comparison
equal deleted inserted replaced
7:801a8b6973fb 8:85e6f4b2ad18
6 return """ 6 return """
7 <html> 7 <html>
8 <head> 8 <head>
9 <meta charset="UTF-8"> 9 <meta charset="UTF-8">
10 <title>Galaxy-Ludwig Report</title> 10 <title>Galaxy-Ludwig Report</title>
11
12 <!-- your existing styles -->
11 <style> 13 <style>
12 body { 14 body {
13 font-family: Arial, sans-serif; 15 font-family: Arial, sans-serif;
14 margin: 0; 16 margin: 0;
15 padding: 20px; 17 padding: 20px;
30 h2 { 32 h2 {
31 border-bottom: 2px solid #4CAF50; 33 border-bottom: 2px solid #4CAF50;
32 color: #4CAF50; 34 color: #4CAF50;
33 padding-bottom: 5px; 35 padding-bottom: 5px;
34 } 36 }
37 /* baseline table setup */
35 table { 38 table {
36 border-collapse: collapse; 39 border-collapse: collapse;
37 margin: 20px 0; 40 margin: 20px 0;
38 width: 100%; 41 width: 100%;
39 table-layout: fixed; /* Enforces consistent column widths */ 42 table-layout: fixed;
40 } 43 }
41 table, th, td { 44 table, th, td {
42 border: 1px solid #ddd; 45 border: 1px solid #ddd;
43 } 46 }
44 th, td { 47 th, td {
45 padding: 8px; 48 padding: 8px;
46 text-align: center; /* Center-align text */ 49 text-align: center;
47 vertical-align: middle; /* Center-align content vertically */ 50 vertical-align: middle;
48 word-wrap: break-word; /* Break long words to avoid overflow */ 51 word-wrap: break-word;
49 }
50 th:first-child, td:first-child {
51 width: 5%; /* Smaller width for the first column */
52 }
53 th:nth-child(2), td:nth-child(2) {
54 width: 50%; /* Wider for the metric/description column */
55 }
56 th:last-child, td:last-child {
57 width: 25%; /* Value column gets remaining space */
58 } 52 }
59 th { 53 th {
60 background-color: #4CAF50; 54 background-color: #4CAF50;
61 color: white; 55 color: white;
62 } 56 }
66 } 60 }
67 .plot img { 61 .plot img {
68 max-width: 100%; 62 max-width: 100%;
69 height: auto; 63 height: auto;
70 } 64 }
65
66 /* -------------------
67 SORTABLE COLUMNS
68 ------------------- */
69 table.performance-summary th.sortable {
70 cursor: pointer;
71 position: relative;
72 user-select: none;
73 }
74 /* hide arrows by default */
75 table.performance-summary th.sortable::after {
76 content: '';
77 position: absolute;
78 right: 12px;
79 top: 50%;
80 transform: translateY(-50%);
81 font-size: 0.8em;
82 color: #666;
83 }
84 /* three states */
85 table.performance-summary th.sortable.sorted-none::after {
86 content: '⇅';
87 }
88 table.performance-summary th.sortable.sorted-asc::after {
89 content: '↑';
90 }
91 table.performance-summary th.sortable.sorted-desc::after {
92 content: '↓';
93 }
71 </style> 94 </style>
95
96 <!-- sorting script -->
97 <script>
98 document.addEventListener('DOMContentLoaded', () => {
99 // 1) record each row's original position
100 document.querySelectorAll('table.performance-summary tbody').forEach(tbody => {
101 Array.from(tbody.rows).forEach((row, i) => {
102 row.dataset.originalOrder = i;
103 });
104 });
105
106 const getText = cell => cell.innerText.trim();
107 const comparer = (idx, asc) => (a, b) => {
108 const v1 = getText(a.children[idx]);
109 const v2 = getText(b.children[idx]);
110 const n1 = parseFloat(v1), n2 = parseFloat(v2);
111 if (!isNaN(n1) && !isNaN(n2)) {
112 return asc ? n1 - n2 : n2 - n1;
113 }
114 return asc
115 ? v1.localeCompare(v2)
116 : v2.localeCompare(v1);
117 };
118
119 document
120 .querySelectorAll('table.performance-summary th.sortable')
121 .forEach(th => {
122 // initialize to "none" state
123 th.classList.add('sorted-none');
124 th.addEventListener('click', () => {
125 const table = th.closest('table');
126 const allTh = table.querySelectorAll('th.sortable');
127
128 // 1) determine current state BEFORE clearing classes
129 let curr = th.classList.contains('sorted-asc')
130 ? 'asc'
131 : th.classList.contains('sorted-desc')
132 ? 'desc'
133 : 'none';
134 // 2) cycle to next state
135 let next = curr === 'none'
136 ? 'asc'
137 : curr === 'asc'
138 ? 'desc'
139 : 'none';
140
141 // 3) clear all sort markers
142 allTh.forEach(h =>
143 h.classList.remove('sorted-none','sorted-asc','sorted-desc')
144 );
145 // 4) apply the new marker
146 th.classList.add(`sorted-${next}`);
147
148 // 5) sort or restore original order
149 const tbody = table.querySelector('tbody');
150 let rows = Array.from(tbody.rows);
151 if (next === 'none') {
152 rows.sort((a, b) =>
153 a.dataset.originalOrder - b.dataset.originalOrder
154 );
155 } else {
156 const idx = Array.from(th.parentNode.children).indexOf(th);
157 rows.sort(comparer(idx, next === 'asc'));
158 }
159 rows.forEach(r => tbody.appendChild(r));
160 });
161 });
162 });
163 </script>
72 </head> 164 </head>
73 <body> 165 <body>
74 <div class="container"> 166 <div class="container">
75 """ 167 """
76 168
201 display: block; 293 display: block;
202 }} 294 }}
203 </style> 295 </style>
204 296
205 <div class="tabs"> 297 <div class="tabs">
206 <div class="tab active" onclick="showTab('metrics')">Config &amp; Results Summary</div> 298 <div class="tab active" onclick="showTab('metrics')">Config and Results Summary</div>
207 <div class="tab" onclick="showTab('trainval')">Train/Validation Results</div> 299 <div class="tab" onclick="showTab('trainval')">Train/Validation Results</div>
208 <div class="tab" onclick="showTab('test')">Test Results</div> 300 <div class="tab" onclick="showTab('test')">Test Results</div>
209 <!-- always-visible help button --> 301 <!-- always-visible help button -->
210 <button id="openMetricsHelp" class="help-btn">Help</button> 302 <button id="openMetricsHelp" class="help-btn">Help</button>
211 </div> 303 </div>
230 </script> 322 </script>
231 """ 323 """
232 324
233 325
234 def get_metrics_help_modal() -> str: 326 def get_metrics_help_modal() -> str:
235 modal_html = """ 327 modal_html = (
236 <div id="metricsHelpModal" class="modal"> 328 '<div id="metricsHelpModal" class="modal">'
237 <div class="modal-content"> 329 ' <div class="modal-content">'
238 <span class="close">×</span> 330 ' <span class="close">×</span>'
239 <h2>Model Evaluation Metrics — Help Guide</h2> 331 ' <h2>Model Evaluation Metrics — Help Guide</h2>'
240 <div class="metrics-guide"> 332 ' <div class="metrics-guide">'
241 <h3>1) General Metrics</h3> 333 ' <h3>1) General Metrics (Regression and Classification)</h3>'
242 <p><strong>Loss:</strong> Measures the difference between predicted and actual values. Lower is better. Often used for optimization during training.</p> 334 ' <p><strong>Loss (Regression & Classification):</strong> '
243 <p><strong>Accuracy:</strong> Proportion of correct predictions among all predictions. Simple but can be misleading for imbalanced datasets.</p> 335 'Measures the difference between predicted and actual values, '
244 <p><strong>Micro Accuracy:</strong> Calculates accuracy by summing up all individual true positives and true negatives across all classes, making it suitable for multiclass or multilabel problems.</p> 336 'optimized during training. Lower is better. '
245 <p><strong>Token Accuracy:</strong> Measures how often the predicted tokens (e.g., in sequences) match the true tokens. Useful in sequence prediction tasks like NLP.</p> 337 'For regression, this is often Mean Squared Error (MSE) or '
246 <h3>2) Precision, Recall & Specificity</h3> 338 'Mean Absolute Error (MAE). For classification, it’s typically '
247 <p><strong>Precision:</strong> Out of all positive predictions, how many were correct. Precision = TP / (TP + FP). Helps when false positives are costly.</p> 339 'cross-entropy or log loss.</p>'
248 <p><strong>Recall (Sensitivity):</strong> Out of all actual positives, how many were predicted correctly. Recall = TP / (TP + FN). Important when missing positives is risky.</p> 340 ' <h3>2) Regression Metrics</h3>'
249 <p><strong>Specificity:</strong> True negative rate. Measures how well the model identifies negatives. Specificity = TN / (TN + FP). Useful in medical testing to avoid false alarms.</p> 341 ' <p><strong>Mean Absolute Error (MAE):</strong> '
250 <h3>3) Macro, Micro, and Weighted Averages</h3> 342 'Average of absolute differences between predicted and actual values, '
251 <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric across all classes, treating each class equally, regardless of class frequency. Best when class sizes are balanced.</p> 343 'in the same units as the target. Use for interpretable error measurement '
252 <p><strong>Micro Precision / Recall / F1:</strong> Aggregates TP, FP, FN across all classes before computing the metric. Gives a global view and is ideal for class-imbalanced problems.</p> 344 'when all errors are equally important. Less sensitive to outliers than MSE.</p>'
253 <p><strong>Weighted Precision / Recall / F1:</strong> Averages each metric across classes, weighted by the number of true instances per class. Balances importance of classes based on frequency.</p> 345 ' <p><strong>Mean Squared Error (MSE):</strong> '
254 <h3>4) Average Precision (PR-AUC Variants)</h3> 346 'Average of squared differences between predicted and actual values. '
255 <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged across all classes equally. Useful for balanced multi-class problems.</p> 347 'Penalizes larger errors more heavily, useful when large deviations are critical. '
256 <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC using all instances. Best for imbalanced data or multi-label classification.</p> 348 'Often used as the loss function in regression.</p>'
257 <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged across individual samples (not classes). Ideal for multi-label problems where each sample can belong to multiple classes.</p> 349 ' <p><strong>Root Mean Squared Error (RMSE):</strong> '
258 <h3>5) ROC-AUC Variants</h3> 350 'Square root of MSE, in the same units as the target. '
259 <p><strong>ROC-AUC:</strong> Measures model's ability to distinguish between classes. AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p> 351 'Balances interpretability and sensitivity to large errors. '
260 <p><strong>Macro ROC-AUC:</strong> Averages the AUC across all classes equally. Suitable when classes are balanced and of equal importance.</p> 352 'Widely used for regression evaluation.</p>'
261 <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions across all classes. Useful in multiclass or multilabel settings with imbalance.</p> 353 ' <p><strong>Mean Absolute Percentage Error (MAPE):</strong> '
262 <h3>6) Ranking Metrics</h3> 354 'Average absolute error as a percentage of actual values. '
263 <p><strong>Hits at K:</strong> Measures whether the true label is among the top-K predictions. Common in recommendation systems and retrieval tasks.</p> 355 'Scale-independent, ideal for comparing relative errors across datasets. '
264 <h3>7) Confusion Matrix Stats (Per Class)</h3> 356 'Avoid when actual values are near zero.</p>'
265 <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions for positives and negatives respectively.</p> 357 ' <p><strong>Root Mean Squared Percentage Error (RMSPE):</strong> '
266 <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions — false alarms and missed detections.</p> 358 'Square root of mean squared percentage error. Scale-independent, '
267 <h3>8) Other Useful Metrics</h3> 359 'penalizes larger relative errors more than MAPE. Use for forecasting '
268 <p><strong>Cohen's Kappa:</strong> Measures agreement between predicted and actual values adjusted for chance. Useful for multiclass classification with imbalanced labels.</p> 360 'or when relative accuracy matters.</p>'
269 <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure of prediction quality that takes into account TP, TN, FP, and FN. Particularly effective for imbalanced datasets.</p> 361 ' <p><strong>R² Score:</strong> Proportion of variance in the target '
270 <h3>9) Metric Recommendations</h3> 362 'explained by the model. Ranges from negative infinity to 1 (perfect prediction). '
271 <ul> 363 'Use to assess model fit; negative values indicate poor performance '
272 <li>Use <strong>Accuracy + F1</strong> for balanced data.</li> 364 'compared to predicting the mean.</p>'
273 <li>Use <strong>Precision, Recall, ROC-AUC</strong> for imbalanced datasets.</li> 365 ' <h3>3) Classification Metrics</h3>'
274 <li>Use <strong>Average Precision Micro</strong> for multilabel or class-imbalanced problems.</li> 366 ' <p><strong>Accuracy:</strong> Proportion of correct predictions '
275 <li>Use <strong>Macro scores</strong> when all classes should be treated equally.</li> 367 'among all predictions. Simple but misleading for imbalanced datasets, '
276 <li>Use <strong>Weighted scores</strong> when class imbalance should be accounted for without ignoring small classes.</li> 368 'where high accuracy may hide poor performance on minority classes.</p>'
277 <li>Use <strong>Confusion Matrix stats</strong> to analyze class-wise performance.</li> 369 ' <p><strong>Micro Accuracy:</strong> Sums true positives and true negatives '
278 <li>Use <strong>Hits at K</strong> for recommendation or ranking-based tasks.</li> 370 'across all classes before computing accuracy. Suitable for multiclass or '
279 </ul> 371 'multilabel problems with imbalanced data.</p>'
280 </div> 372 ' <p><strong>Token Accuracy:</strong> Measures how often predicted tokens '
281 </div> 373 '(e.g., in sequences) match true tokens. Common in NLP tasks like text generation '
282 </div> 374 'or token classification.</p>'
283 """ 375 ' <p><strong>Precision:</strong> Proportion of positive predictions that are '
284 modal_css = """ 376 'correct (TP / (TP + FP)). Use when false positives are costly, e.g., spam detection.</p>'
285 <style> 377 ' <p><strong>Recall (Sensitivity):</strong> Proportion of actual positives '
286 .modal { 378 'correctly predicted (TP / (TP + FN)). Use when missing positives is risky, '
287 display: none; 379 'e.g., disease detection.</p>'
288 position: fixed; 380 ' <p><strong>Specificity:</strong> True negative rate (TN / (TN + FP)). '
289 z-index: 1; 381 'Measures ability to identify negatives. Useful in medical testing to avoid '
290 left: 0; 382 'false alarms.</p>'
291 top: 0; 383 ' <h3>4) Classification: Macro, Micro, and Weighted Averages</h3>'
292 width: 100%; 384 ' <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric '
293 height: 100%; 385 'across all classes, treating each equally. Best for balanced datasets where '
294 overflow: auto; 386 'all classes are equally important.</p>'
295 background-color: rgba(0,0,0,0.4); 387 ' <p><strong>Micro Precision / Recall / F1:</strong> Aggregates true positives, '
296 } 388 'false positives, and false negatives across all classes before computing. '
297 .modal-content { 389 'Ideal for imbalanced or multilabel classification.</p>'
298 background-color: #fefefe; 390 ' <p><strong>Weighted Precision / Recall / F1:</strong> Averages metrics '
299 margin: 15% auto; 391 'across classes, weighted by the number of true instances per class. Balances '
300 padding: 20px; 392 'class importance based on frequency.</p>'
301 border: 1px solid #888; 393 ' <h3>5) Classification: Average Precision (PR-AUC Variants)</h3>'
302 width: 80%; 394 ' <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged '
303 max-width: 800px; 395 'equally across classes. Use for balanced multiclass problems.</p>'
304 } 396 ' <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC '
305 .close { 397 'using all instances. Best for imbalanced or multilabel classification.</p>'
306 color: #aaa; 398 ' <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged '
307 float: right; 399 'across individual samples. Ideal for multilabel tasks where samples have multiple '
308 font-size: 28px; 400 'labels.</p>'
309 font-weight: bold; 401 ' <h3>6) Classification: ROC-AUC Variants</h3>'
310 } 402 ' <p><strong>ROC-AUC:</strong> Measures ability to distinguish between classes. '
311 .close:hover, 403 'AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>'
312 .close:focus { 404 ' <p><strong>Macro ROC-AUC:</strong> Averages AUC across all classes equally. '
313 color: black; 405 'Suitable for balanced multiclass problems.</p>'
314 text-decoration: none; 406 ' <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions '
315 cursor: pointer; 407 'across all classes. Useful for imbalanced or multilabel settings.</p>'
316 } 408 ' <h3>7) Classification: Confusion Matrix Stats (Per Class)</h3>'
317 .metrics-guide h3 { 409 ' <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions '
318 margin-top: 20px; 410 'for positives and negatives, respectively.</p>'
319 } 411 ' <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions '
320 .metrics-guide p { 412 '— false alarms and missed detections.</p>'
321 margin: 5px 0; 413 ' <h3>8) Classification: Ranking Metrics</h3>'
322 } 414 ' <p><strong>Hits at K:</strong> Measures whether the true label is among the '
323 .metrics-guide ul { 415 'top-K predictions. Common in recommendation systems and retrieval tasks.</p>'
324 margin: 10px 0; 416 ' <h3>9) Other Metrics (Classification)</h3>'
325 padding-left: 20px; 417 ' <p><strong>Cohen\'s Kappa:</strong> Measures agreement between predicted and '
326 } 418 'actual labels, adjusted for chance. Useful for multiclass classification with '
327 </style> 419 'imbalanced data.</p>'
328 """ 420 ' <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure '
329 modal_js = """ 421 'using TP, TN, FP, and FN. Effective for imbalanced datasets.</p>'
330 <script> 422 ' <h3>10) Metric Recommendations</h3>'
331 document.addEventListener("DOMContentLoaded", function() { 423 ' <ul>'
332 var modal = document.getElementById("metricsHelpModal"); 424 ' <li><strong>Regression:</strong> Use <strong>RMSE</strong> or '
333 var openBtn = document.getElementById("openMetricsHelp"); 425 '<strong>MAE</strong> for general evaluation, <strong>MAPE</strong> for relative '
334 var span = document.getElementsByClassName("close")[0]; 426 'errors, and <strong>R²</strong> to assess model fit. Use <strong>MSE</strong> or '
335 if (openBtn && modal) { 427 '<strong>RMSPE</strong> when large errors are critical.</li>'
336 openBtn.onclick = function() { 428 ' <li><strong>Classification (Balanced Data):</strong> Use <strong>Accuracy</strong> '
337 modal.style.display = "block"; 429 'and <strong>F1</strong> for overall performance.</li>'
338 }; 430 ' <li><strong>Classification (Imbalanced Data):</strong> Use <strong>Precision</strong>, '
339 } 431 '<strong>Recall</strong>, and <strong>ROC-AUC</strong> to focus on minority class '
340 if (span && modal) { 432 'performance.</li>'
341 span.onclick = function() { 433 ' <li><strong>Multilabel or Imbalanced Classification:</strong> Use '
342 modal.style.display = "none"; 434 '<strong>Micro Precision/Recall/F1</strong> or <strong>Micro ROC-AUC</strong>.</li>'
343 }; 435 ' <li><strong>Balanced Multiclass:</strong> Use <strong>Macro Precision/Recall/F1</strong> '
344 } 436 'or <strong>Macro ROC-AUC</strong>.</li>'
345 window.onclick = function(event) { 437 ' <li><strong>Class Frequency Matters:</strong> Use <strong>Weighted Precision/Recall/F1</strong> '
346 if (event.target == modal) { 438 'to account for class imbalance.</li>'
347 modal.style.display = "none"; 439 ' <li><strong>Recommendation/Ranking:</strong> Use <strong>Hits at K</strong> for retrieval tasks.</li>'
348 } 440 ' <li><strong>Detailed Analysis:</strong> Use <strong>Confusion Matrix stats</strong> '
349 } 441 'for class-wise performance in classification.</li>'
350 }); 442 ' </ul>'
351 </script> 443 ' </div>'
352 """ 444 ' </div>'
445 '</div>'
446 )
447 modal_css = (
448 "<style>"
449 ".modal {"
450 " display: none;"
451 " position: fixed;"
452 " z-index: 1;"
453 " left: 0;"
454 " top: 0;"
455 " width: 100%;"
456 " height: 100%;"
457 " overflow: auto;"
458 " background-color: rgba(0,0,0,0.4);"
459 "}"
460 ".modal-content {"
461 " background-color: #fefefe;"
462 " margin: 15% auto;"
463 " padding: 20px;"
464 " border: 1px solid #888;"
465 " width: 80%;"
466 " max-width: 800px;"
467 "}"
468 ".close {"
469 " color: #aaa;"
470 " float: right;"
471 " font-size: 28px;"
472 " font-weight: bold;"
473 "}"
474 ".close:hover,"
475 ".close:focus {"
476 " color: black;"
477 " text-decoration: none;"
478 " cursor: pointer;"
479 "}"
480 ".metrics-guide h3 {"
481 " margin-top: 20px;"
482 "}"
483 ".metrics-guide p {"
484 " margin: 5px 0;"
485 "}"
486 ".metrics-guide ul {"
487 " margin: 10px 0;"
488 " padding-left: 20px;"
489 "}"
490 "</style>"
491 )
492 modal_js = (
493 "<script>"
494 'document.addEventListener("DOMContentLoaded", function() {'
495 ' var modal = document.getElementById("metricsHelpModal");'
496 ' var openBtn = document.getElementById("openMetricsHelp");'
497 ' var span = document.getElementsByClassName("close")[0];'
498 " if (openBtn && modal) {"
499 " openBtn.onclick = function() {"
500 " modal.style.display = \"block\";"
501 " };"
502 " }"
503 " if (span && modal) {"
504 " span.onclick = function() {"
505 " modal.style.display = \"none\";"
506 " };"
507 " }"
508 " window.onclick = function(event) {"
509 " if (event.target == modal) {"
510 " modal.style.display = \"none\";"
511 " }"
512 " }"
513 "});"
514 "</script>"
515 )
353 return modal_css + modal_html + modal_js 516 return modal_css + modal_html + modal_js