def make_result(expected: str, actual: str) -> EvaluationResultDto:
    """Build the per-datapoint result an upstream classifier would emit.

    The score is 1.0 when the two labels are equal ignoring case and 0.0
    otherwise. The raw labels are carried in ``details`` as a dumped
    ``BaseEvaluatorJustification`` so a dataset evaluator can rebuild the
    confusion matrix from them.

    Args:
        expected: Ground-truth label for the datapoint.
        actual: Label the classifier produced.

    Returns:
        An ``EvaluationResultDto`` holding the score and the justification dict.
    """
    score = 1.0 if expected.lower() == actual.lower() else 0.0
    justification = BaseEvaluatorJustification(expected=expected, actual=actual)
    return EvaluationResultDto(score=score, details=justification.model_dump())


def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]:
    """Turn ``(expected, actual)`` pairs into per-datapoint results, in order."""
    return [make_result(expected, actual) for expected, actual in pairs]


def print_header(title: str) -> None:
    """Print a blank line followed by ``title`` framed by two 78-column rules."""
    rule = "═" * 78
    print()
    print(rule)
    print(f" {title}")
    print(rule)


def print_confusion(details: ClassificationDetails) -> None:
    """Pretty-print the confusion matrix as a table.

    Rows are predicted labels and columns are expected labels, i.e. the cell
    printed at (row, column) is ``confusion_matrix[predicted_idx][expected_idx]``.

    Args:
        details: Classification report whose ``classes`` and
            ``confusion_matrix`` are rendered.
    """
    classes = details.classes
    # ``default=0`` keeps the width computation from raising ValueError when
    # the report carries no classes; the table then degrades to a bare header.
    cell_width = max(7, max((len(c) for c in classes), default=0) + 1)
    header = (
        " " * cell_width
        + " │ "
        + " │ ".join(c.center(cell_width) for c in classes)
        + " │ ← expected"
    )
    print(header)
    print("─" * len(header))
    for predicted_idx, predicted_label in enumerate(classes):
        row = details.confusion_matrix[predicted_idx]
        row_cells = [
            str(row[expected_idx]).rjust(cell_width)
            for expected_idx in range(len(classes))
        ]
        print(predicted_label.ljust(cell_width) + " │ " + " │ ".join(row_cells) + " │")
    print(" " * cell_width + "↑ predicted")


def print_per_class(details: ClassificationDetails) -> None:
    """Print one row per class with TP/TN/FP/FN, support and the metric value.

    Args:
        details: Classification report whose ``per_class`` mapping is rendered;
            ``details.metric`` is used as the heading of the value column.
    """
    # Same empty-input guard as in print_confusion: never call max() on an
    # empty sequence without a default.
    label_w = max(len("class"), max((len(c) for c in details.classes), default=0))
    metric = details.metric
    header = f" {'class'.ljust(label_w)} │ TP TN FP FN support {metric}"
    print(header)
    print(" " + "─" * (len(header) - 2))
    for cls, m in details.per_class.items():
        print(
            f" {cls.ljust(label_w)} │ "
            f"{m.tp:>2} {m.tn:>2} {m.fp:>2} {m.fn:>2} {m.support:>7} "
            f"{m.value:.3f}"
        )
def scenario_1_balanced_three_class() -> None:
    """Run macro precision on a balanced three-intent dataset.

    Every intent contributes two correct predictions and one prediction that
    is confused with the next intent in the cycle book → cancel → reschedule
    → book, so each class ends up with 2 TP, 1 FP and 1 FN.
    """
    intents = ["book", "cancel", "reschedule"]
    pairs: list[tuple[str, str]] = []
    for position, intent in enumerate(intents):
        confused_with = intents[(position + 1) % len(intents)]
        pairs += [(intent, intent), (intent, intent), (intent, confused_with)]

    evaluator = build_dataset_evaluator(
        PrecisionAggregatorSpec(classes=intents, averaging="macro"),
        source_evaluator="intent_match",
    )
    outcome = evaluator.evaluate(materialize_pairs(pairs))
    title = (
        "Scenario 1 — Balanced 3-class (intent recognition)\n"
        " Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3."
    )
    report(title, outcome, show_json_tail=True)
def scenario_3_precision_vs_recall_vs_f() -> None:
    """Report precision, recall, F1 and F2 over one asymmetric 2-class dataset.

    Per-class counts for the data below:

    * ``yes``: TP=2, FP=2, FN=1 → precision 1/2, recall 2/3
    * ``no``:  TP=2, FP=1, FN=2 → precision 2/3, recall 1/2

    The per-class values therefore differ between metrics, while the macro
    averages of precision and recall coincide (7/12 each) because the two
    classes mirror each other. The titles are kept in line with these
    numbers; in particular 'yes' recall is 2/3, not 1.0.
    """
    pairs = [
        ("yes", "yes"),
        ("yes", "yes"),
        ("no", "yes"),
        ("no", "yes"),
        ("no", "no"),
        ("no", "no"),
        ("yes", "no"),
    ]
    results = materialize_pairs(pairs)
    classes = ["yes", "no"]

    # Title → aggregator spec; dict order is the order the reports are printed.
    specs = {
        "Scenario 3a — Precision on a recall-favourable dataset": (
            PrecisionAggregatorSpec(classes=classes, averaging="macro")
        ),
        "Scenario 3b — Recall (same data — 'yes' recall is 2/3 vs precision 1/2)": (
            RecallAggregatorSpec(classes=classes, averaging="macro")
        ),
        "Scenario 3c — F1 (harmonic mean of P and R)": (
            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0)
        ),
        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)": (
            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=2.0)
        ),
    }
    for title, spec in specs.items():
        evaluator = build_dataset_evaluator(spec, source_evaluator="yes_match")
        report(title, evaluator.evaluate(results))
def scenario_5_realistic_intent_classifier() -> None:
    """Run macro F1 on a larger 4-class dataset with uneven per-class quality."""
    # (expected, actual) pair together with how many times it occurs.
    weighted_pairs = [
        (("book", "book"), 10),
        (("book", "cancel"), 1),
        (("cancel", "cancel"), 6),
        (("cancel", "book"), 1),
        (("cancel", "modify"), 1),
        (("reschedule", "reschedule"), 2),
        (("reschedule", "modify"), 2),
        (("modify", "modify"), 1),
        (("modify", "reschedule"), 1),
    ]
    pairs = [pair for pair, count in weighted_pairs for _ in range(count)]

    spec = FScoreAggregatorSpec(
        classes=["book", "cancel", "reschedule", "modify"],
        averaging="macro",
        f_value=1.0,
    )
    macro_f1 = build_dataset_evaluator(spec, source_evaluator="intent_match")
    outcome = macro_f1.evaluate(materialize_pairs(pairs))
    report(
        "Scenario 5 — Realistic 4-class intent classifier\n"
        " Uneven per-class performance. Macro F1 surfaces 'reschedule' and\n"
        " 'modify' weakness; micro F1 would have hidden it under 'book' wins.",
        outcome,
    )


def main() -> None:
    """Run the five scenarios in order, then print a closing line."""
    scenarios = (
        scenario_1_balanced_three_class,
        scenario_2_imbalanced_two_class,
        scenario_3_precision_vs_recall_vs_f,
        scenario_4_skipped_datapoints,
        scenario_5_realistic_intent_classifier,
    )
    for run_scenario in scenarios:
        run_scenario()
    print()
    print("Done. All scenarios computed from real evaluator code.")


if __name__ == "__main__":
    main()
class _AggregatorSpecBase(BaseModel):
    """Shared pydantic config for every aggregator variant.

    Carries no fields of its own: it only sets ``model_config`` so that each
    variant gets ``to_camel`` aliases (e.g. ``f_value`` ↔ ``fValue``) while
    still accepting the Python field names (``populate_by_name=True``).
    """

    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)


class PrecisionAggregatorSpec(_AggregatorSpecBase):
    """Run-level precision aggregator (multiclass, micro or macro averaged)."""

    # Discriminator value used by the ``AggregatorSpec`` union.
    type: Literal["precision"] = "precision"
    # Class labels the aggregator reports on; at least one is required.
    classes: list[str] = Field(..., min_length=1)
    # Which average is selected; required, no default.
    averaging: Literal["macro", "micro"]


class RecallAggregatorSpec(_AggregatorSpecBase):
    """Run-level recall aggregator (multiclass, micro or macro averaged)."""

    # Discriminator value used by the ``AggregatorSpec`` union.
    type: Literal["recall"] = "recall"
    # Class labels the aggregator reports on; at least one is required.
    classes: list[str] = Field(..., min_length=1)
    # Which average is selected; required, no default.
    averaging: Literal["macro", "micro"]


class FScoreAggregatorSpec(_AggregatorSpecBase):
    """Run-level F-beta aggregator (multiclass, micro or macro averaged)."""

    # Discriminator value used by the ``AggregatorSpec`` union.
    type: Literal["fscore"] = "fscore"
    # Class labels the aggregator reports on; at least one is required.
    classes: list[str] = Field(..., min_length=1)
    # Which average is selected; required, no default.
    averaging: Literal["macro", "micro"]
    # Beta of the F-beta score; must be strictly positive, defaults to F1.
    f_value: float = Field(default=1.0, gt=0)


# Tagged union over the three variants: pydantic picks the concrete model
# from the ``type`` field when validating.
AggregatorSpec = Annotated[
    Union[PrecisionAggregatorSpec, RecallAggregatorSpec, FScoreAggregatorSpec],
    Field(discriminator="type"),
]
SpecT = TypeVar("SpecT", bound="AggregatorSpec")


class BaseDatasetEvaluator(ABC, Generic[SpecT]):
    """Abstract base for evaluators that score a whole dataset at once.

    An instance pairs an aggregator spec with the name of the per-datapoint
    evaluator whose results it consumes. Subclasses implement
    :meth:`evaluate`; :attr:`name` exposes the key
    ``"{source_evaluator}.{spec.type}"``.
    """

    spec: SpecT
    source_evaluator: str

    def __init__(self, spec: SpecT, source_evaluator: str) -> None:
        """Remember the spec and the source evaluator name.

        Args:
            spec: Aggregator spec this evaluator is configured by.
            source_evaluator: Name of the per-datapoint evaluator that
                produced the results passed to :meth:`evaluate`.
        """
        self.source_evaluator = source_evaluator
        self.spec = spec

    @property
    def name(self) -> str:
        """Key of this evaluator: source evaluator name, a dot, the spec type."""
        return "{}.{}".format(self.source_evaluator, self.spec.type)

    @abstractmethod
    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
        """Collapse per-datapoint results into one run-level EvaluationResult."""
def _coerce_justification(details: object) -> BaseEvaluatorJustification | None:
    """Return ``details`` as a BaseEvaluatorJustification, or None if it isn't one.

    An existing justification instance is returned unchanged and a dict is
    validated into one. Any other payload type, or a dict that fails
    validation, yields None so the caller can count the datapoint as skipped.
    """
    if isinstance(details, BaseEvaluatorJustification):
        return details
    if not isinstance(details, dict):
        return None
    try:
        parsed = BaseEvaluatorJustification.model_validate(details)
    except Exception:
        # Best effort by design: an unparseable payload is reported as a
        # skipped datapoint rather than aborting the whole aggregation.
        return None
    return parsed
def _build_confusion(
    results: list[EvaluationResultDto],
    classes: list[str],
) -> _ConfusionData:
    """Build a confusion matrix from per-datapoint results.

    Labels are compared in lowercase, so a classifier returning "Book" still
    matches a configured "book". Configured classes that become identical
    after lowercasing are collapsed into one (first occurrence wins), which
    keeps the matrix free of phantom all-zero rows and keeps per-class
    reporting aligned with ``classes``.

    A result is skipped (counted in ``n_skipped`` and left out of the matrix)
    when its details cannot be coerced to a justification, or when its
    expected or actual label is not one of the configured classes.

    Args:
        results: Per-datapoint results from the source evaluator.
        classes: Configured class labels, in reporting order.

    Returns:
        The lowercase, de-duplicated class list, the k x k matrix indexed as
        ``matrix[predicted_idx][expected_idx]``, and the total / scored /
        skipped counts.
    """
    # dict.fromkeys de-duplicates while preserving the configured order.
    canonical_classes = list(dict.fromkeys(c.lower() for c in classes))
    index_of = {label: i for i, label in enumerate(canonical_classes)}
    k = len(canonical_classes)
    matrix = [[0] * k for _ in range(k)]

    n_scored = 0
    for r in results:
        j = _coerce_justification(r.details)
        if j is None:
            continue
        expected_idx = index_of.get(j.expected.lower())
        predicted_idx = index_of.get(j.actual.lower())
        if expected_idx is None or predicted_idx is None:
            # Out-of-vocabulary label on either side: not scoreable.
            continue
        matrix[predicted_idx][expected_idx] += 1
        n_scored += 1

    n_total = len(results)
    return _ConfusionData(
        classes=canonical_classes,
        matrix=matrix,
        n_total=n_total,
        n_scored=n_scored,
        # Every result is either scored or skipped.
        n_skipped=n_total - n_scored,
    )
def _metric(metric_type: str, tp: int, fp: int, fn: int, beta_sq: float) -> float:
    """Compute precision, recall or F-beta from confusion counts.

    Args:
        metric_type: ``"precision"``, ``"recall"`` or ``"fscore"``.
        tp: True positives.
        fp: False positives.
        fn: False negatives.
        beta_sq: Beta squared; only read for ``"fscore"``.

    Returns:
        The metric value, or 0.0 whenever its denominator is not positive.

    Raises:
        ValueError: If ``metric_type`` is not one of the three known names.
    """

    def _share(hits: int, misses: int) -> float:
        # hits / (hits + misses), defined as 0.0 when there is nothing to divide by.
        total = hits + misses
        return hits / total if total > 0 else 0.0

    if metric_type == "precision":
        return _share(tp, fp)
    if metric_type == "recall":
        return _share(tp, fn)
    if metric_type != "fscore":
        raise ValueError(
            f"Unknown metric_type: {metric_type!r}. "
            "Expected one of: precision, recall, fscore."
        )

    p = _share(tp, fp)
    r = _share(tp, fn)
    denom = beta_sq * p + r
    if denom > 0:
        return (1 + beta_sq) * p * r / denom
    return 0.0
+ """ + return ClassificationDatasetEvaluator(spec, source_evaluator) diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py index 69790c3aa..842d13174 100644 --- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py +++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py @@ -20,6 +20,7 @@ UiPathEvaluationError, UiPathEvaluationErrorCategory, ) +from ._aggregator_specs import AggregatorSpec from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification from .output_evaluator import ( BaseOutputEvaluator, @@ -43,6 +44,12 @@ class MulticlassClassificationEvaluatorConfig( metric_type: Literal["precision", "recall", "f-score"] = "f-score" averaging: Literal["micro", "macro"] = "macro" f_value: float = 1.0 + # Optional run-level aggregators (precision / recall / fscore). Each is a + # self-contained spec carrying its own ``classes``, ``averaging``, and + # (for fscore) ``f_value``. The dataset-evaluator runtime walks this list + # after all per-datapoint evaluators complete and emits one structured + # result per aggregator keyed by ``{evaluator_name}.{aggregator.type}``. 
def compute_dataset_evaluator_results(
    evaluation_set_results: list[UiPathEvalRunResult],
    evaluators: Iterable[GenericBaseEvaluator[Any, Any, Any]],
) -> dict[str, EvaluationResultDto]:
    """Run the dataset-level aggregators embedded in per-datapoint evaluator configs.

    Walks ``evaluators`` looking for those whose ``evaluator_config`` is a
    binary or multiclass classification config with a non-empty
    ``aggregators`` list. For each aggregator spec it builds the dataset
    evaluator via the factory and runs it over the per-datapoint results
    recorded under that config's ``name``.

    Args:
        evaluation_set_results: Per-datapoint results from the run.
        evaluators: Per-datapoint evaluator instances that ran during this
            eval set. Their configs may carry ``aggregators`` lists.

    Returns:
        Dict mapping ``"{evaluator_name}.{aggregator_type}"`` to the run-level
        EvaluationResultDto. When the same aggregator ``type`` appears more
        than once on a source, the key is extended with ``.{averaging}``
        (and ``.fb{f_value}`` for fscore). If two specs still produce the
        same key (for example two macro-precision aggregators that differ
        only in ``classes``), later ones get a numeric suffix (``.2``,
        ``.3``, ...) so no result is ever overwritten. Aggregators whose
        source produced no results are still invoked with an empty list.
    """
    # Group per-datapoint results by the evaluator that produced them.
    # Entries flagged ``is_line_result`` are left out of the grouping.
    results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict(
        list
    )
    for eval_run_result in evaluation_set_results:
        for eval_run_result_dto in eval_run_result.evaluation_run_results:
            if eval_run_result_dto.is_line_result:
                continue
            results_by_evaluator[eval_run_result_dto.evaluator_name].append(
                eval_run_result_dto.result
            )

    dataset_results: dict[str, EvaluationResultDto] = {}
    for evaluator in evaluators:
        # Aggregators currently only live on classification evaluator configs.
        # ``GenericBaseEvaluator`` doesn't declare ``evaluator_config``, so it
        # is read via ``getattr`` and narrowed with ``isinstance`` before
        # ``aggregators`` is touched. Widen the tuple if another evaluator
        # type grows an ``aggregators`` field.
        config = getattr(evaluator, "evaluator_config", None)
        if not isinstance(
            config,
            (
                BinaryClassificationEvaluatorConfig,
                MulticlassClassificationEvaluatorConfig,
            ),
        ):
            continue
        if not config.aggregators:
            continue

        source_name = config.name
        source_results = results_by_evaluator.get(source_name, [])

        # The short ``{source}.{type}`` key is only safe when the type is
        # unique on this source, so count occurrences per type first.
        type_counts: dict[str, int] = defaultdict(int)
        for spec in config.aggregators:
            type_counts[spec.type] += 1

        for spec in config.aggregators:
            dataset_evaluator = build_dataset_evaluator(spec, source_name)
            key = _unique_result_key(
                _dataset_result_key(source_name, spec, type_counts[spec.type] > 1),
                dataset_results,
            )
            dataset_results[key] = EvaluationResultDto.from_evaluation_result(
                dataset_evaluator.evaluate(source_results)
            )
    return dataset_results


def _unique_result_key(key: str, taken: dict[str, EvaluationResultDto]) -> str:
    """Return ``key`` unchanged if unused, else ``key`` plus the first free ``.N`` suffix (N >= 2)."""
    if key not in taken:
        return key
    ordinal = 2
    while f"{key}.{ordinal}" in taken:
        ordinal += 1
    return f"{key}.{ordinal}"


def _dataset_result_key(
    source_name: str, spec: AggregatorSpec, disambiguate: bool
) -> str:
    """Build the result-dict key for a dataset evaluator.

    Args:
        source_name: Name of the per-datapoint evaluator the spec is attached to.
        spec: The aggregator spec being keyed.
        disambiguate: True when the spec's type occurs more than once on the
            same source.

    Returns:
        ``{source}.{type}`` when ``disambiguate`` is false; otherwise
        ``{source}.{type}.{averaging}``, with ``.fb{f_value}`` appended for
        fscore specs.
    """
    if not disambiguate:
        return f"{source_name}.{spec.type}"
    if isinstance(spec, FScoreAggregatorSpec):
        return f"{source_name}.{spec.type}.{spec.averaging}.fb{spec.f_value}"
    return f"{source_name}.{spec.type}.{spec.averaging}"
def _result(
    expected: str, actual: str, score: float | None = None
) -> EvaluationResultDto:
    """Create a per-datapoint result whose details hold the expected/actual pair.

    Args:
        expected: The expected label.
        actual: The label that was actually produced.
        score: Explicit score to use. When omitted, the score is 1.0 if the
            two labels match case-insensitively and 0.0 otherwise.

    Returns:
        An ``EvaluationResultDto`` with the dumped justification as ``details``.
    """
    if score is not None:
        resolved_score = score
    elif expected.lower() == actual.lower():
        resolved_score = 1.0
    else:
        resolved_score = 0.0
    details = BaseEvaluatorJustification(expected=expected, actual=actual).model_dump()
    return EvaluationResultDto(score=resolved_score, details=details)
def _multiclass_evaluator(
    name: str,
    classes: list[str],
    aggregators: list[BaseModel],
) -> MulticlassClassificationEvaluator:
    """Construct a multiclass evaluator whose config embeds ``aggregators``.

    Args:
        name: Evaluator name placed in the config.
        classes: Class labels placed in the config.
        aggregators: Aggregator specs; each is dumped with ``by_alias=True``
            before being placed in the config payload.

    Returns:
        The evaluator produced by validating the assembled payload, with a
        freshly generated UUID string as its ``id``.
    """
    evaluator_id = str(uuid.uuid4())
    dumped_aggregators = [
        aggregator.model_dump(by_alias=True) for aggregator in aggregators
    ]
    evaluator_config = {
        "name": name,
        "classes": classes,
        "aggregators": dumped_aggregators,
    }
    payload = {"id": evaluator_id, "evaluatorConfig": evaluator_config}
    return MulticlassClassificationEvaluator.model_validate(payload)
def test_precision_two_class_macro(self) -> None:
    """Macro precision equals the unweighted mean of per-class precision."""
    labelled = [("yes", "yes"), ("yes", "yes"), ("yes", "no"), ("no", "yes")]
    datapoints = [_result(expected, actual) for expected, actual in labelled]
    outcome = _precision(["yes", "no"], averaging="macro").evaluate(datapoints)
    details = _details(outcome)

    # "yes" is predicted three times and is correct twice -> 2/3.
    yes_precision = 2 / 3
    # "no" is predicted once and is wrong -> 0.
    no_precision = 0.0
    expected_macro = (yes_precision + no_precision) / 2

    assert details.per_class["yes"].value == pytest.approx(yes_precision)
    assert details.per_class["no"].value == pytest.approx(no_precision)
    assert details.macro == pytest.approx(expected_macro)
    # The headline score must be the macro figure for averaging="macro".
    assert outcome.score == pytest.approx(details.macro)
class TestRecallEvaluator:
    """Recall math on two-class data, including a case where it differs from precision."""

    def test_recall_two_class_macro(self) -> None:
        labelled = [("yes", "yes"), ("yes", "yes"), ("yes", "no"), ("no", "yes")]
        datapoints = [_result(expected, actual) for expected, actual in labelled]
        outcome = _recall(["yes", "no"], averaging="macro").evaluate(datapoints)
        per_class = _details(outcome).per_class
        # "yes" is expected three times and recovered twice; "no" is expected
        # once and never recovered, so the macro mean is (2/3 + 0) / 2.
        assert per_class["yes"].value == pytest.approx(2 / 3)
        assert per_class["no"].value == pytest.approx(0.0)
        assert outcome.score == pytest.approx(1 / 3)

    def test_recall_differs_from_precision(self) -> None:
        labelled = [
            ("yes", "yes"),
            ("yes", "yes"),
            ("no", "yes"),
            ("no", "yes"),
            ("no", "no"),
        ]
        datapoints = [_result(expected, actual) for expected, actual in labelled]
        classes = ["yes", "no"]
        precision = _details(
            _precision(classes, averaging="macro").evaluate(datapoints)
        ).per_class
        recall = _details(
            _recall(classes, averaging="macro").evaluate(datapoints)
        ).per_class
        # Same data, different denominators: precision divides by the number
        # of predictions per class, recall by the number of expectations.
        assert precision["yes"].value == pytest.approx(0.5)
        assert precision["no"].value == pytest.approx(1.0)
        assert recall["yes"].value == pytest.approx(1.0)
        assert recall["no"].value == pytest.approx(1 / 3)
class TestSkippingAndEdgeCases:
    """Datapoints that cannot be scored are counted as skipped; labels match case-insensitively."""

    def test_out_of_vocab_labels_are_skipped(self) -> None:
        # Only the first pair has both labels inside the configured classes.
        datapoints = [
            _result(expected, actual)
            for expected, actual in (
                ("cat", "cat"),
                ("cat", "platypus"),
                ("zebra", "dog"),
            )
        ]
        details = _details(_precision(["cat", "dog"]).evaluate(datapoints))
        assert (details.n_total, details.n_scored, details.n_skipped) == (3, 1, 2)

    def test_results_without_justification_are_skipped(self) -> None:
        usable = _result("cat", "cat")
        string_details = EvaluationResultDto(score=1.0, details="just a string")
        wrong_shape = EvaluationResultDto(score=0.0, details={"unrelated": "shape"})
        details = _details(
            _precision(["cat", "dog"]).evaluate([usable, string_details, wrong_shape])
        )
        assert (details.n_total, details.n_scored, details.n_skipped) == (3, 1, 2)

    def test_case_insensitive(self) -> None:
        datapoints = [_result("Cat", "CAT"), _result("DOG", "dog")]
        per_class = _details(_precision(["cat", "dog"]).evaluate(datapoints)).per_class
        # Each mixed-case pair must land as a true positive for its class.
        for label in ("cat", "dog"):
            assert per_class[label].tp == 1
class TestAggregatorSpecJsonRoundTrip:
    """Pin the wire shape sent to the C# side."""

    def test_precision_uses_self_contained_fields(self) -> None:
        def wire_payload() -> dict[str, object]:
            # Fresh dict per call so input and expectation never share state.
            return {
                "type": "precision",
                "classes": ["book", "cancel", "reschedule"],
                "averaging": "macro",
            }

        spec = PrecisionAggregatorSpec.model_validate(wire_payload())
        assert spec.model_dump(by_alias=True) == wire_payload()

    def test_fscore_uses_camelcase_fvalue_on_wire(self) -> None:
        wire = {
            "type": "fscore",
            "classes": ["yes", "no"],
            "averaging": "macro",
            "fValue": 1.5,
        }
        spec = FScoreAggregatorSpec.model_validate(wire)
        assert spec.f_value == 1.5
        # Dumping by alias must emit the camelCase key only.
        dumped = spec.model_dump(by_alias=True)
        assert dumped["fValue"] == 1.5
        assert "f_value" not in dumped

    def test_multiclass_evaluator_round_trips_aggregators(self) -> None:
        """Per-datapoint evaluator config carries aggregators[]; survives dump+load."""
        labels = ["book", "cancel", "reschedule"]
        specs: list[BaseModel] = [
            PrecisionAggregatorSpec(classes=list(labels), averaging="macro"),
            FScoreAggregatorSpec(
                classes=list(labels),
                averaging="macro",
                f_value=1.0,
            ),
        ]
        evaluator = _multiclass_evaluator(
            "intent_classifier", classes=list(labels), aggregators=specs
        )
        loaded = evaluator.evaluator_config.aggregators
        assert loaded is not None
        assert len(loaded) == 2
        # Order of the embedded aggregators is preserved.
        assert loaded[0].type == "precision"
        assert loaded[1].type == "fscore"
def test_line_by_line_subresults_are_excluded(self) -> None:
    """A result flagged ``is_line_result=True`` does not count towards ``n_scored``."""
    labels = ["yes", "no"]
    evaluator = _multiclass_evaluator(
        "intent_match",
        classes=labels,
        aggregators=[PrecisionAggregatorSpec(classes=labels, averaging="macro")],
    )
    line_level = UiPathEvalRunResultDto(
        evaluator_name="intent_match",
        evaluator_id=str(uuid.uuid4()),
        result=_result("yes", "yes"),
        is_line_result=True,
    )
    datapoint_level = UiPathEvalRunResultDto(
        evaluator_name="intent_match",
        evaluator_id=str(uuid.uuid4()),
        result=_result("no", "no"),
    )
    run = UiPathEvalRunResult(
        evaluation_name="dp1",
        evaluation_run_results=[line_level, datapoint_level],
    )

    out = compute_dataset_evaluator_results([run], [evaluator])

    details = out["intent_match.precision"].details
    assert isinstance(details, dict)
    # Two results were supplied, but only the non-line one is scored.
    assert details["n_scored"] == 1
def test_duplicate_fscore_disambiguates_by_averaging_and_fvalue(self) -> None:
    """Two FScore aggregators (e.g. F1 macro and F2 macro) both survive."""
    labels = ["yes", "no"]
    fscore_specs: list[BaseModel] = [
        FScoreAggregatorSpec(classes=list(labels), averaging="macro", f_value=beta)
        for beta in (1.0, 2.0)
    ]
    evaluator = _multiclass_evaluator(
        "intent_match", classes=list(labels), aggregators=fscore_specs
    )
    datapoint = UiPathEvalRunResultDto(
        evaluator_name="intent_match",
        evaluator_id=str(uuid.uuid4()),
        result=_result("yes", "yes"),
    )
    run = UiPathEvalRunResult(
        evaluation_name="dp1",
        evaluation_run_results=[datapoint],
    )

    out = compute_dataset_evaluator_results([run], [evaluator])

    # Same type and averaging, different beta: the f_value suffix is what
    # keeps the two results under separate keys.
    expected_keys = {
        "intent_match.fscore.macro.fb1.0",
        "intent_match.fscore.macro.fb2.0",
    }
    assert set(out) == expected_keys