comparison utils.py @ 9:9e912fce264c draft default tip

planemo upload for repository https://github.com/goeckslab/gleam.git commit eace0d7c2b2939029c052991d238a54947d2e191
author goeckslab
date Wed, 27 Aug 2025 21:02:48 +0000
parents 85e6f4b2ad18
children
comparison
equal deleted inserted replaced
8:85e6f4b2ad18 9:9e912fce264c
6 return """ 6 return """
7 <html> 7 <html>
8 <head> 8 <head>
9 <meta charset="UTF-8"> 9 <meta charset="UTF-8">
10 <title>Galaxy-Ludwig Report</title> 10 <title>Galaxy-Ludwig Report</title>
11
12 <!-- your existing styles -->
13 <style> 11 <style>
14 body { 12 body {
15 font-family: Arial, sans-serif; 13 font-family: Arial, sans-serif;
16 margin: 0; 14 margin: 0;
17 padding: 20px; 15 padding: 20px;
326 def get_metrics_help_modal() -> str: 324 def get_metrics_help_modal() -> str:
327 modal_html = ( 325 modal_html = (
328 '<div id="metricsHelpModal" class="modal">' 326 '<div id="metricsHelpModal" class="modal">'
329 ' <div class="modal-content">' 327 ' <div class="modal-content">'
330 ' <span class="close">×</span>' 328 ' <span class="close">×</span>'
331 ' <h2>Model Evaluation Metrics — Help Guide</h2>' 329 " <h2>Model Evaluation Metrics — Help Guide</h2>"
332 ' <div class="metrics-guide">' 330 ' <div class="metrics-guide">'
333 ' <h3>1) General Metrics (Regression and Classification)</h3>' 331 " <h3>1) General Metrics (Regression and Classification)</h3>"
334 ' <p><strong>Loss (Regression & Classification):</strong> ' 332 " <p><strong>Loss (Regression & Classification):</strong> "
335 'Measures the difference between predicted and actual values, ' 333 "Measures the difference between predicted and actual values, "
336 'optimized during training. Lower is better. ' 334 "optimized during training. Lower is better. "
337 'For regression, this is often Mean Squared Error (MSE) or ' 335 "For regression, this is often Mean Squared Error (MSE) or "
338 'Mean Absolute Error (MAE). For classification, it’s typically ' 336 "Mean Absolute Error (MAE). For classification, it’s typically "
339 'cross-entropy or log loss.</p>' 337 "cross-entropy or log loss.</p>"
340 ' <h3>2) Regression Metrics</h3>' 338 " <h3>2) Regression Metrics</h3>"
341 ' <p><strong>Mean Absolute Error (MAE):</strong> ' 339 " <p><strong>Mean Absolute Error (MAE):</strong> "
342 'Average of absolute differences between predicted and actual values, ' 340 "Average of absolute differences between predicted and actual values, "
343 'in the same units as the target. Use for interpretable error measurement ' 341 "in the same units as the target. Use for interpretable error measurement "
344 'when all errors are equally important. Less sensitive to outliers than MSE.</p>' 342 "when all errors are equally important. Less sensitive to outliers than MSE.</p>"
345 ' <p><strong>Mean Squared Error (MSE):</strong> ' 343 " <p><strong>Mean Squared Error (MSE):</strong> "
346 'Average of squared differences between predicted and actual values. ' 344 "Average of squared differences between predicted and actual values. "
347 'Penalizes larger errors more heavily, useful when large deviations are critical. ' 345 "Penalizes larger errors more heavily, useful when large deviations are critical. "
348 'Often used as the loss function in regression.</p>' 346 "Often used as the loss function in regression.</p>"
349 ' <p><strong>Root Mean Squared Error (RMSE):</strong> ' 347 " <p><strong>Root Mean Squared Error (RMSE):</strong> "
350 'Square root of MSE, in the same units as the target. ' 348 "Square root of MSE, in the same units as the target. "
351 'Balances interpretability and sensitivity to large errors. ' 349 "Balances interpretability and sensitivity to large errors. "
352 'Widely used for regression evaluation.</p>' 350 "Widely used for regression evaluation.</p>"
353 ' <p><strong>Mean Absolute Percentage Error (MAPE):</strong> ' 351 " <p><strong>Mean Absolute Percentage Error (MAPE):</strong> "
354 'Average absolute error as a percentage of actual values. ' 352 "Average absolute error as a percentage of actual values. "
355 'Scale-independent, ideal for comparing relative errors across datasets. ' 353 "Scale-independent, ideal for comparing relative errors across datasets. "
356 'Avoid when actual values are near zero.</p>' 354 "Avoid when actual values are near zero.</p>"
357 ' <p><strong>Root Mean Squared Percentage Error (RMSPE):</strong> ' 355 " <p><strong>Root Mean Squared Percentage Error (RMSPE):</strong> "
358 'Square root of mean squared percentage error. Scale-independent, ' 356 "Square root of mean squared percentage error. Scale-independent, "
359 'penalizes larger relative errors more than MAPE. Use for forecasting ' 357 "penalizes larger relative errors more than MAPE. Use for forecasting "
360 'or when relative accuracy matters.</p>' 358 "or when relative accuracy matters.</p>"
361 ' <p><strong>R² Score:</strong> Proportion of variance in the target ' 359 " <p><strong>R² Score:</strong> Proportion of variance in the target "
362 'explained by the model. Ranges from negative infinity to 1 (perfect prediction). ' 360 "explained by the model. Ranges from negative infinity to 1 (perfect prediction). "
363 'Use to assess model fit; negative values indicate poor performance ' 361 "Use to assess model fit; negative values indicate poor performance "
364 'compared to predicting the mean.</p>' 362 "compared to predicting the mean.</p>"
365 ' <h3>3) Classification Metrics</h3>' 363 " <h3>3) Classification Metrics</h3>"
366 ' <p><strong>Accuracy:</strong> Proportion of correct predictions ' 364 " <p><strong>Accuracy:</strong> Proportion of correct predictions "
367 'among all predictions. Simple but misleading for imbalanced datasets, ' 365 "among all predictions. Simple but misleading for imbalanced datasets, "
368 'where high accuracy may hide poor performance on minority classes.</p>' 366 "where high accuracy may hide poor performance on minority classes.</p>"
369 ' <p><strong>Micro Accuracy:</strong> Sums true positives and true negatives ' 367 " <p><strong>Micro Accuracy:</strong> Sums true positives and true negatives "
370 'across all classes before computing accuracy. Suitable for multiclass or ' 368 "across all classes before computing accuracy. Suitable for multiclass or "
371 'multilabel problems with imbalanced data.</p>' 369 "multilabel problems with imbalanced data.</p>"
372 ' <p><strong>Token Accuracy:</strong> Measures how often predicted tokens ' 370 " <p><strong>Token Accuracy:</strong> Measures how often predicted tokens "
373 '(e.g., in sequences) match true tokens. Common in NLP tasks like text generation ' 371 "(e.g., in sequences) match true tokens. Common in NLP tasks like text generation "
374 'or token classification.</p>' 372 "or token classification.</p>"
375 ' <p><strong>Precision:</strong> Proportion of positive predictions that are ' 373 " <p><strong>Precision:</strong> Proportion of positive predictions that are "
376 'correct (TP / (TP + FP)). Use when false positives are costly, e.g., spam detection.</p>' 374 "correct (TP / (TP + FP)). Use when false positives are costly, e.g., spam detection.</p>"
377 ' <p><strong>Recall (Sensitivity):</strong> Proportion of actual positives ' 375 " <p><strong>Recall (Sensitivity):</strong> Proportion of actual positives "
378 'correctly predicted (TP / (TP + FN)). Use when missing positives is risky, ' 376 "correctly predicted (TP / (TP + FN)). Use when missing positives is risky, "
379 'e.g., disease detection.</p>' 377 "e.g., disease detection.</p>"
380 ' <p><strong>Specificity:</strong> True negative rate (TN / (TN + FP)). ' 378 " <p><strong>Specificity:</strong> True negative rate (TN / (TN + FP)). "
381 'Measures ability to identify negatives. Useful in medical testing to avoid ' 379 "Measures ability to identify negatives. Useful in medical testing to avoid "
382 'false alarms.</p>' 380 "false alarms.</p>"
383 ' <h3>4) Classification: Macro, Micro, and Weighted Averages</h3>' 381 " <h3>4) Classification: Macro, Micro, and Weighted Averages</h3>"
384 ' <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric ' 382 " <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric "
385 'across all classes, treating each equally. Best for balanced datasets where ' 383 "across all classes, treating each equally. Best for balanced datasets where "
386 'all classes are equally important.</p>' 384 "all classes are equally important.</p>"
387 ' <p><strong>Micro Precision / Recall / F1:</strong> Aggregates true positives, ' 385 " <p><strong>Micro Precision / Recall / F1:</strong> Aggregates true positives, "
388 'false positives, and false negatives across all classes before computing. ' 386 "false positives, and false negatives across all classes before computing. "
389 'Ideal for imbalanced or multilabel classification.</p>' 387 "Ideal for imbalanced or multilabel classification.</p>"
390 ' <p><strong>Weighted Precision / Recall / F1:</strong> Averages metrics ' 388 " <p><strong>Weighted Precision / Recall / F1:</strong> Averages metrics "
391 'across classes, weighted by the number of true instances per class. Balances ' 389 "across classes, weighted by the number of true instances per class. Balances "
392 'class importance based on frequency.</p>' 390 "class importance based on frequency.</p>"
393 ' <h3>5) Classification: Average Precision (PR-AUC Variants)</h3>' 391 " <h3>5) Classification: Average Precision (PR-AUC Variants)</h3>"
394 ' <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged ' 392 " <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged "
395 'equally across classes. Use for balanced multiclass problems.</p>' 393 "equally across classes. Use for balanced multiclass problems.</p>"
396 ' <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC ' 394 " <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC "
397 'using all instances. Best for imbalanced or multilabel classification.</p>' 395 "using all instances. Best for imbalanced or multilabel classification.</p>"
398 ' <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged ' 396 " <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged "
399 'across individual samples. Ideal for multilabel tasks where samples have multiple ' 397 "across individual samples. Ideal for multilabel tasks where samples have multiple "
400 'labels.</p>' 398 "labels.</p>"
401 ' <h3>6) Classification: ROC-AUC Variants</h3>' 399 " <h3>6) Classification: ROC-AUC Variants</h3>"
402 ' <p><strong>ROC-AUC:</strong> Measures ability to distinguish between classes. ' 400 " <p><strong>ROC-AUC:</strong> Measures ability to distinguish between classes. "
403 'AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>' 401 "AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>"
404 ' <p><strong>Macro ROC-AUC:</strong> Averages AUC across all classes equally. ' 402 " <p><strong>Macro ROC-AUC:</strong> Averages AUC across all classes equally. "
405 'Suitable for balanced multiclass problems.</p>' 403 "Suitable for balanced multiclass problems.</p>"
406 ' <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions ' 404 " <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions "
407 'across all classes. Useful for imbalanced or multilabel settings.</p>' 405 "across all classes. Useful for imbalanced or multilabel settings.</p>"
408 ' <h3>7) Classification: Confusion Matrix Stats (Per Class)</h3>' 406 " <h3>7) Classification: Confusion Matrix Stats (Per Class)</h3>"
409 ' <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions ' 407 " <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions "
410 'for positives and negatives, respectively.</p>' 408 "for positives and negatives, respectively.</p>"
411 ' <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions ' 409 " <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions "
412 '— false alarms and missed detections.</p>' 410 "— false alarms and missed detections.</p>"
413 ' <h3>8) Classification: Ranking Metrics</h3>' 411 " <h3>8) Classification: Ranking Metrics</h3>"
414 ' <p><strong>Hits at K:</strong> Measures whether the true label is among the ' 412 " <p><strong>Hits at K:</strong> Measures whether the true label is among the "
415 'top-K predictions. Common in recommendation systems and retrieval tasks.</p>' 413 "top-K predictions. Common in recommendation systems and retrieval tasks.</p>"
416 ' <h3>9) Other Metrics (Classification)</h3>' 414 " <h3>9) Other Metrics (Classification)</h3>"
417 ' <p><strong>Cohen\'s Kappa:</strong> Measures agreement between predicted and ' 415 " <p><strong>Cohen's Kappa:</strong> Measures agreement between predicted and "
418 'actual labels, adjusted for chance. Useful for multiclass classification with ' 416 "actual labels, adjusted for chance. Useful for multiclass classification with "
419 'imbalanced data.</p>' 417 "imbalanced data.</p>"
420 ' <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure ' 418 " <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure "
421 'using TP, TN, FP, and FN. Effective for imbalanced datasets.</p>' 419 "using TP, TN, FP, and FN. Effective for imbalanced datasets.</p>"
422 ' <h3>10) Metric Recommendations</h3>' 420 " <h3>10) Metric Recommendations</h3>"
423 ' <ul>' 421 " <ul>"
424 ' <li><strong>Regression:</strong> Use <strong>RMSE</strong> or ' 422 " <li><strong>Regression:</strong> Use <strong>RMSE</strong> or "
425 '<strong>MAE</strong> for general evaluation, <strong>MAPE</strong> for relative ' 423 "<strong>MAE</strong> for general evaluation, <strong>MAPE</strong> for relative "
426 'errors, and <strong>R²</strong> to assess model fit. Use <strong>MSE</strong> or ' 424 "errors, and <strong>R²</strong> to assess model fit. Use <strong>MSE</strong> or "
427 '<strong>RMSPE</strong> when large errors are critical.</li>' 425 "<strong>RMSPE</strong> when large errors are critical.</li>"
428 ' <li><strong>Classification (Balanced Data):</strong> Use <strong>Accuracy</strong> ' 426 " <li><strong>Classification (Balanced Data):</strong> Use <strong>Accuracy</strong> "
429 'and <strong>F1</strong> for overall performance.</li>' 427 "and <strong>F1</strong> for overall performance.</li>"
430 ' <li><strong>Classification (Imbalanced Data):</strong> Use <strong>Precision</strong>, ' 428 " <li><strong>Classification (Imbalanced Data):</strong> Use <strong>Precision</strong>, "
431 '<strong>Recall</strong>, and <strong>ROC-AUC</strong> to focus on minority class ' 429 "<strong>Recall</strong>, and <strong>ROC-AUC</strong> to focus on minority class "
432 'performance.</li>' 430 "performance.</li>"
433 ' <li><strong>Multilabel or Imbalanced Classification:</strong> Use ' 431 " <li><strong>Multilabel or Imbalanced Classification:</strong> Use "
434 '<strong>Micro Precision/Recall/F1</strong> or <strong>Micro ROC-AUC</strong>.</li>' 432 "<strong>Micro Precision/Recall/F1</strong> or <strong>Micro ROC-AUC</strong>.</li>"
435 ' <li><strong>Balanced Multiclass:</strong> Use <strong>Macro Precision/Recall/F1</strong> ' 433 " <li><strong>Balanced Multiclass:</strong> Use <strong>Macro Precision/Recall/F1</strong> "
436 'or <strong>Macro ROC-AUC</strong>.</li>' 434 "or <strong>Macro ROC-AUC</strong>.</li>"
437 ' <li><strong>Class Frequency Matters:</strong> Use <strong>Weighted Precision/Recall/F1</strong> ' 435 " <li><strong>Class Frequency Matters:</strong> Use <strong>Weighted Precision/Recall/F1</strong> "
438 'to account for class imbalance.</li>' 436 "to account for class imbalance.</li>"
439 ' <li><strong>Recommendation/Ranking:</strong> Use <strong>Hits at K</strong> for retrieval tasks.</li>' 437 " <li><strong>Recommendation/Ranking:</strong> Use <strong>Hits at K</strong> for retrieval tasks.</li>"
440 ' <li><strong>Detailed Analysis:</strong> Use <strong>Confusion Matrix stats</strong> ' 438 " <li><strong>Detailed Analysis:</strong> Use <strong>Confusion Matrix stats</strong> "
441 'for class-wise performance in classification.</li>' 439 "for class-wise performance in classification.</li>"
442 ' </ul>' 440 " </ul>"
443 ' </div>' 441 " </div>"
444 ' </div>' 442 " </div>"
445 '</div>' 443 "</div>"
446 ) 444 )
447 modal_css = ( 445 modal_css = (
448 "<style>" 446 "<style>"
449 ".modal {" 447 ".modal {"
450 " display: none;" 448 " display: none;"
495 ' var modal = document.getElementById("metricsHelpModal");' 493 ' var modal = document.getElementById("metricsHelpModal");'
496 ' var openBtn = document.getElementById("openMetricsHelp");' 494 ' var openBtn = document.getElementById("openMetricsHelp");'
497 ' var span = document.getElementsByClassName("close")[0];' 495 ' var span = document.getElementsByClassName("close")[0];'
498 " if (openBtn && modal) {" 496 " if (openBtn && modal) {"
499 " openBtn.onclick = function() {" 497 " openBtn.onclick = function() {"
500 " modal.style.display = \"block\";" 498 ' modal.style.display = "block";'
501 " };" 499 " };"
502 " }" 500 " }"
503 " if (span && modal) {" 501 " if (span && modal) {"
504 " span.onclick = function() {" 502 " span.onclick = function() {"
505 " modal.style.display = \"none\";" 503 ' modal.style.display = "none";'
506 " };" 504 " };"
507 " }" 505 " }"
508 " window.onclick = function(event) {" 506 " window.onclick = function(event) {"
509 " if (event.target == modal) {" 507 " if (event.target == modal) {"
510 " modal.style.display = \"none\";" 508 ' modal.style.display = "none";'
511 " }" 509 " }"
512 " }" 510 " }"
513 "});" 511 "});"
514 "</script>" 512 "</script>"
515 ) 513 )