diff --git a/packages/uipath/examples/dataset_evaluators_demo.py b/packages/uipath/examples/dataset_evaluators_demo.py
new file mode 100644
index 000000000..1a3c376c0
--- /dev/null
+++ b/packages/uipath/examples/dataset_evaluators_demo.py
@@ -0,0 +1,295 @@
+"""Runnable proof that the dataset-level evaluators work on realistic data.
+
+Five scenarios exercise the framework end-to-end at the SDK layer (no
+worker, no backend). Each prints the headline score plus a confusion
+matrix table, so the math is inspectable rather than a passing-test
+binary signal.
+
+Run::
+
+    cd packages/uipath
+    uv run python examples/dataset_evaluators_demo.py
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Iterable
+
+from uipath.eval.evaluators._aggregator_specs import (
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDetails,
+)
+from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from uipath.eval.models.models import (
+    EvaluationResult,
+    EvaluationResultDto,
+    NumericEvaluationResult,
+)
+
+# ─── helpers ──────────────────────────────────────────────────────────────────
+
+
+def make_result(expected: str, actual: str) -> EvaluationResultDto:
+    """Build a single per-datapoint EvaluationResultDto.
+
+    Models what an upstream classification evaluator would produce after running
+    on one datapoint: score is 1.0 if the labels match, 0.0 otherwise, with the
+    expected/actual labels carried in the justification.
+    """
+    score = 1.0 if expected.lower() == actual.lower() else 0.0
+    justification = BaseEvaluatorJustification(expected=expected, actual=actual)
+    return EvaluationResultDto(score=score, details=justification.model_dump())
+
+
+def materialize_pairs(pairs: Iterable[tuple[str, str]]) -> list[EvaluationResultDto]:
+    """Build a list of EvaluationResultDto from (expected, actual) pairs."""
+    return [make_result(e, a) for e, a in pairs]
+
+
+def print_header(title: str) -> None:
+    """Print a section header banner."""
+    print()
+    print("═" * 78)
+    print(f" {title}")
+    print("═" * 78)
+
+
+def print_confusion(details: ClassificationDetails) -> None:
+    """Print the confusion matrix (rows = predicted, columns = expected)."""
+    classes = details.classes
+    cell_width = max(7, max(len(c) for c in classes) + 1)
+    header = (
+        " " * cell_width
+        + " │ "
+        + " │ ".join(c.center(cell_width) for c in classes)
+        + " │  ← expected"
+    )
+    print(header)
+    print("─" * len(header))
+    for predicted_idx, predicted_label in enumerate(classes):
+        row_cells = [
+            str(details.confusion_matrix[predicted_idx][expected_idx]).rjust(cell_width)
+            for expected_idx in range(len(classes))
+        ]
+        print(predicted_label.ljust(cell_width) + " │ " + " │ ".join(row_cells) + " │")
+    print(" " * cell_width + "↑ predicted")
+
+
+def print_per_class(details: ClassificationDetails) -> None:
+    """One-row-per-class table of TP/TN/FP/FN + the metric."""
+    label_w = max(len("class"), max(len(c) for c in details.classes))
+    metric = details.metric
+    header = f"  {'class'.ljust(label_w)}  │  TP  TN  FP  FN  support  {metric}"
+    print(header)
+    print("  " + "─" * (len(header) - 2))
+    for cls, m in details.per_class.items():
+        print(
+            f"  {cls.ljust(label_w)}  │  "
+            f"{m.tp:>2}  {m.tn:>2}  {m.fp:>2}  {m.fn:>2}  {m.support:>7}  "
+            f"{m.value:.3f}"
+        )
+
+
+def report(
+    title: str,
+    result: EvaluationResult,
+    *,
+    show_json_tail: bool = False,
+) -> None:
+    """Render one scenario's result block."""
+    print_header(title)
+    assert isinstance(result, NumericEvaluationResult)
+    assert isinstance(result.details, ClassificationDetails)
+    d = result.details
+    print(
+        f"  metric = {d.metric}   average = {d.average}   "
+        f"score (headline) = {result.score:.4f}"
+    )
+    print(
+        f"  micro = {d.micro:.4f}   macro = {d.macro:.4f}   "
+        f"scored = {d.n_scored}/{d.n_total}   skipped = {d.n_skipped}"
+    )
+    print()
+    print_confusion(d)
+    print()
+    print_per_class(d)
+    if show_json_tail:
+        print()
+        print("  ── wire JSON (matches frontend zod schema) ──")
+        payload = d.model_dump(by_alias=True)
+        print(
+            "  "
+            + json.dumps(
+                {k: payload[k] for k in ("metric", "average", "micro", "macro")},
+                indent=2,
+            ).replace("\n", "\n  ")
+        )
+
+
+# ─── scenarios ────────────────────────────────────────────────────────────────
+
+
+def scenario_1_balanced_three_class() -> None:
+    """Intent recognition over book/cancel/reschedule. Every class gets 2 right, 1 wrong."""
+    pairs = [
+        ("book", "book"),
+        ("book", "book"),
+        ("book", "cancel"),
+        ("cancel", "cancel"),
+        ("cancel", "cancel"),
+        ("cancel", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "book"),
+    ]
+    spec = PrecisionAggregatorSpec(
+        classes=["book", "cancel", "reschedule"], averaging="macro"
+    )
+    evaluator = build_dataset_evaluator(spec, source_evaluator="intent_match")
+    report(
+        "Scenario 1 — Balanced 3-class (intent recognition)\n"
+        "  Each class: 2 TP, 1 FP, 1 FN. Symmetric setup → macro = micro = 2/3.",
+        evaluator.evaluate(materialize_pairs(pairs)),
+        show_json_tail=True,
+    )
+
+
+def scenario_2_imbalanced_two_class() -> None:
+    """Rare-positive case — why macro vs micro matters."""
+    pairs: list[tuple[str, str]] = []
+    pairs += [("negative", "negative")] * 13
+    pairs += [("negative", "positive")] * 3
+    pairs += [("positive", "positive")] * 2
+    pairs += [("positive", "negative")] * 2
+
+    results = materialize_pairs(pairs)
+    classes = ["positive", "negative"]
+
+    macro = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=classes, averaging="macro"),
+        source_evaluator="positive_match",
+    )
+    micro = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=classes, averaging="micro"),
+        source_evaluator="positive_match",
+    )
+    report(
+        "Scenario 2a — Imbalanced 2-class, MACRO precision\n"
+        "  Rare positive class. Macro averages per-class, so the rare class\n"
+        "  having precision = 2/(2+3) = 0.40 drags the score down.",
+        macro.evaluate(results),
+    )
+    report(
+        "Scenario 2b — Same data, MICRO precision\n"
+        "  Pools TP/FP across classes. In a 2-class case this equals accuracy.",
+        micro.evaluate(results),
+    )
+
+
+def scenario_3_precision_vs_recall_vs_f() -> None:
+    """Same dataset, three metrics — per-class values diverge on asymmetric data."""
+    pairs = [  # (expected, actual)
+        ("yes", "yes"),
+        ("yes", "yes"),
+        ("no", "yes"),  # false positive for "yes", false negative for "no"
+        ("no", "yes"),
+        ("no", "no"),
+        ("no", "no"),
+        ("yes", "no"),  # the one missed "yes": its recall is 2/3, not 1.0
+    ]
+    results = materialize_pairs(pairs)
+    classes = ["yes", "no"]
+
+    evaluators = {
+        "Scenario 3a — Precision on a recall-favourable dataset": build_dataset_evaluator(
+            PrecisionAggregatorSpec(classes=classes, averaging="macro"),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3b — Recall (same data — 'yes' recall 2/3 vs precision 1/2)": build_dataset_evaluator(
+            RecallAggregatorSpec(classes=classes, averaging="macro"),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3c — F1 (harmonic mean of P and R)": build_dataset_evaluator(
+            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0),
+            source_evaluator="yes_match",
+        ),
+        "Scenario 3d — F2 (β=2 weighs recall higher — score moves toward recall)": build_dataset_evaluator(
+            FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=2.0),
+            source_evaluator="yes_match",
+        ),
+    }
+    for title, evaluator in evaluators.items():
+        report(title, evaluator.evaluate(results))
+
+
+def scenario_4_skipped_datapoints() -> None:
+    """Show how malformed / out-of-vocab data is reported, not silently dropped."""
+    results = [
+        make_result("cat", "cat"),
+        make_result("dog", "dog"),
+        make_result("cat", "platypus"),
+        make_result("zebra", "cat"),
+        EvaluationResultDto(score=1.0, details="bare string — no justification"),
+        EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+    ]
+    evaluator = build_dataset_evaluator(
+        PrecisionAggregatorSpec(classes=["cat", "dog"], averaging="macro"),
+        source_evaluator="any_match",
+    )
+    report(
+        "Scenario 4 — Skipped datapoints (out-of-vocab + malformed details)\n"
+        "  6 datapoints in, 2 scored, 4 skipped. Skip counts surface in the\n"
+        "  report so you can tell whether a low score is a real signal or\n"
+        "  just sparse data.",
+        evaluator.evaluate(results),
+    )
+
+
+def scenario_5_realistic_intent_classifier() -> None:
+    """A larger, more interesting 4-class dataset — uneven per-class performance."""
+    pairs = [
+        *[("book", "book")] * 10,
+        ("book", "cancel"),
+        *[("cancel", "cancel")] * 6,
+        ("cancel", "book"),
+        ("cancel", "modify"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "reschedule"),
+        ("reschedule", "modify"),
+        ("reschedule", "modify"),
+        ("modify", "modify"),
+        ("modify", "reschedule"),
+    ]
+    results = materialize_pairs(pairs)
+    classes = ["book", "cancel", "reschedule", "modify"]
+    macro_f1 = build_dataset_evaluator(
+        FScoreAggregatorSpec(classes=classes, averaging="macro", f_value=1.0),
+        source_evaluator="intent_match",
+    )
+    report(
+        "Scenario 5 — Realistic 4-class intent classifier\n"
+        "  Uneven per-class performance. Macro F1 surfaces 'reschedule' and\n"
+        "  'modify' weakness; micro F1 would have hidden it under 'book' wins.",
+        macro_f1.evaluate(results),
+    )
+
+
+def main() -> None:
+    """Run every scenario sequentially."""
+    scenario_1_balanced_three_class()
+    scenario_2_imbalanced_two_class()
+    scenario_3_precision_vs_recall_vs_f()
+    scenario_4_skipped_datapoints()
+    scenario_5_realistic_intent_classifier()
+    print()
+    print("Done. All scenarios computed from real evaluator code.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
new file mode 100644
index 000000000..6c0b2b880
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/_aggregator_specs.py
@@ -0,0 +1,53 @@
+"""Aggregator specs embedded in per-datapoint classification evaluator configs.
+
+Each aggregator is a self-contained run-level metric (precision / recall /
+f-score) attached to a classification evaluator. Specs do not share any
+properties — each variant declares its own ``classes``, ``averaging``, and
+(for fscore) ``f_value`` independently. This keeps each aggregator's contract
+explicit at the JSON level: nothing is hoisted up to the evaluator and silently
+applied to siblings.
+"""
+
+from __future__ import annotations
+
+from typing import Annotated, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+
+class _AggregatorSpecBase(BaseModel):
+    """Shared pydantic config for every aggregator variant."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+
+class PrecisionAggregatorSpec(_AggregatorSpecBase):
+    """Run-level precision aggregator (multiclass, micro or macro averaged)."""
+
+    type: Literal["precision"] = "precision"
+    classes: list[str] = Field(..., min_length=1)
+    averaging: Literal["macro", "micro"]
+
+
+class RecallAggregatorSpec(_AggregatorSpecBase):
+    """Run-level recall aggregator (multiclass, micro or macro averaged)."""
+
+    type: Literal["recall"] = "recall"
+    classes: list[str] = Field(..., min_length=1)
+    averaging: Literal["macro", "micro"]
+
+
+class FScoreAggregatorSpec(_AggregatorSpecBase):
+    """Run-level F-beta aggregator (multiclass, micro or macro averaged)."""
+
+    type: Literal["fscore"] = "fscore"
+    classes: list[str] = Field(..., min_length=1)
+    averaging: Literal["macro", "micro"]
+    f_value: float = Field(default=1.0, gt=0)
+
+
+AggregatorSpec = Annotated[
+    Union[PrecisionAggregatorSpec, RecallAggregatorSpec, FScoreAggregatorSpec],
+    Field(discriminator="type"),
+]
diff --git a/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
new file mode 100644
index 000000000..c00eb666a
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/base_dataset_evaluator.py
@@ -0,0 +1,56 @@
+"""Base abstractions for dataset-level evaluators.
+
+A dataset-level evaluator runs once per evaluation set, after all per-datapoint
+evaluators have produced their results. It consumes the per-datapoint
+EvaluationResultDto values from one named source evaluator and emits a single
+EvaluationResult that summarizes the dataset.
+
+Unlike the earlier pointer-style design, dataset evaluators no longer carry
+their own JSON config or a ``source_evaluator`` field. They are constructed by
+the factory directly from an :class:`AggregatorSpec` embedded in a per-datapoint
+classification evaluator's config, together with the source evaluator's name
+which is supplied externally by the runtime when walking those configs.
+
+Concretely distinct from GenericBaseEvaluator: different evaluate() signature,
+different lifecycle. Kept as a parallel hierarchy rather than a subclass so the
+runtime cannot accidentally dispatch a dataset evaluator through the
+per-datapoint loop.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from ..models.models import EvaluationResult, EvaluationResultDto
+from ._aggregator_specs import AggregatorSpec
+
+SpecT = TypeVar("SpecT", bound="AggregatorSpec")
+
+
+class BaseDatasetEvaluator(ABC, Generic[SpecT]):
+    """Abstract base for dataset-level evaluators.
+
+    Constructed from an :class:`AggregatorSpec` and the name of the source
+    per-datapoint evaluator whose results this aggregator consumes. ``name``
+    is ``"{source_evaluator}.{spec.type}"``: distinct per aggregator type on a
+    source, but shared by two aggregators of the same type (e.g. macro and
+    micro precision), so it is not unique on its own in that case.
+    """
+
+    spec: SpecT
+    source_evaluator: str
+
+    def __init__(self, spec: SpecT, source_evaluator: str) -> None:
+        """Store the aggregator spec and the source evaluator name."""
+        self.spec = spec
+        self.source_evaluator = source_evaluator
+
+    @property
+    def name(self) -> str:
+        """Return ``"{source_evaluator}.{spec.type}"``; same-type aggregators share it."""
+        return f"{self.source_evaluator}.{self.spec.type}"
+
+    @abstractmethod
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Reduce per-datapoint results into a single run-level EvaluationResult."""
diff --git a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
index d56509228..0a65c2c64 100644
--- a/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/binary_classification_evaluator.py
@@ -19,6 +19,7 @@
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
+from ._aggregator_specs import AggregatorSpec
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification
 from .output_evaluator import (
     BaseOutputEvaluator,
@@ -41,6 +42,12 @@ class BinaryClassificationEvaluatorConfig(
     positive_class: str
     metric_type: Literal["precision", "recall", "f-score"] = "precision"
     f_value: float = 1.0
+    # Optional run-level aggregators (precision / recall / fscore). Each is a
+    # self-contained spec carrying its own ``classes``, ``averaging``, and
+    # (for fscore) ``f_value``. After all per-datapoint evaluators complete,
+    # the runtime emits one result per aggregator, keyed by
+    # ``{evaluator_name}.{aggregator.type}`` (suffixed when a type repeats).
+    aggregators: list[AggregatorSpec] | None = None
 
 
 class BinaryClassificationEvaluator(
diff --git a/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
new file mode 100644
index 000000000..70d74cd26
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/classification_dataset_evaluators.py
@@ -0,0 +1,208 @@
+"""Dataset-level classification evaluators: Precision, Recall, F-score.
+
+All three share the same internal machinery — a k x k confusion matrix built
+from each per-datapoint result's BaseEvaluatorJustification (expected, actual)
+strings. They differ only in the final formula and (for F-score) the beta
+parameter. The headline ``score`` is the micro or macro average per the
+embedded :class:`AggregatorSpec`; ``details`` carries the full per-class
+breakdown plus the confusion matrix.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from pydantic import BaseModel, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+from ..models.models import (
+    EvaluationResult,
+    EvaluationResultDto,
+    NumericEvaluationResult,
+)
+from ._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
+from .base_dataset_evaluator import BaseDatasetEvaluator
+from .base_evaluator import BaseEvaluatorJustification
+
+
+def _coerce_justification(details: object) -> BaseEvaluatorJustification | None:
+    """Return ``details`` as a BaseEvaluatorJustification, or None if it isn't one."""
+    if isinstance(details, BaseEvaluatorJustification):
+        return details
+    if isinstance(details, dict):
+        try:
+            return BaseEvaluatorJustification.model_validate(details)
+        except Exception:  # best-effort: an unparseable dict is reported as None
+            return None
+    return None  # any other payload type (e.g. a bare string) is not a justification
+
+
+class PerClassMetrics(BaseModel):
+    """Per-class confusion counts plus the metric the evaluator computed."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    tp: int
+    tn: int
+    fp: int
+    fn: int
+    support: int
+    value: float
+
+
+class ClassificationDetails(BaseModel):
+    """Structured details payload emitted by every classification evaluator."""
+
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+
+    metric: str
+    average: str
+    classes: list[str]
+    confusion_matrix: list[list[int]] = Field(
+        ...,
+        description=(
+            "k x k confusion matrix indexed as "
+            "``confusion_matrix[predicted_idx][expected_idx]`` "
+            "(rows are predicted classes, columns are expected). "
+            "This is the transpose of sklearn's convention "
+            "(``[true][predicted]``); UI / consumer code must use the "
+            "orientation documented here."
+        ),
+    )
+    per_class: dict[str, PerClassMetrics]
+    micro: float
+    macro: float
+    n_total: int
+    n_scored: int
+    n_skipped: int
+
+
+@dataclass(slots=True)
+class _ConfusionData:
+    """Internal: confusion matrix and per-class counts derived from results."""
+
+    classes: list[str]
+    matrix: list[list[int]]
+    n_total: int
+    n_scored: int
+    n_skipped: int
+
+
+def _build_confusion(
+    results: list[EvaluationResultDto],
+    classes: list[str],
+) -> _ConfusionData:
+    """Build a confusion matrix from per-datapoint results.
+
+    Results without a parseable justification, or whose expected/actual label
+    isn't in ``classes``, are counted in ``n_skipped`` and left out of the
+    matrix. Labels are lowercased so "Book" matches a configured "book".
+    Configured classes that collide after lowercasing are collapsed to one
+    entry (first occurrence keeps its position): one matrix row per class.
+    """
+    canonical_classes = list(dict.fromkeys(c.lower() for c in classes))
+    index_of = {c: i for i, c in enumerate(canonical_classes)}
+    k = len(canonical_classes)
+    matrix = [[0] * k for _ in range(k)]
+
+    n_total = len(results)
+    n_scored = n_skipped = 0
+
+    for r in results:
+        j = _coerce_justification(r.details)
+        if j is None:
+            n_skipped += 1
+            continue
+        exp = j.expected.lower()
+        act = j.actual.lower()
+        if exp not in index_of or act not in index_of:
+            n_skipped += 1
+            continue
+        matrix[index_of[act]][index_of[exp]] += 1  # rows = predicted, cols = expected
+        n_scored += 1
+
+    return _ConfusionData(
+        classes=canonical_classes,
+        matrix=matrix,
+        n_total=n_total,
+        n_scored=n_scored,
+        n_skipped=n_skipped,
+    )
+
+
+class ClassificationDatasetEvaluator(BaseDatasetEvaluator[AggregatorSpec]):
+    """One implementation for all three classification aggregators.
+
+    Dispatches on ``self.spec.type`` to pick the per-class metric formula:
+    precision, recall, or F-beta. The math (confusion-matrix build, per-class
+    counts, micro/macro averaging) is identical across the three.
+    """
+
+    def evaluate(self, results: list[EvaluationResultDto]) -> EvaluationResult:
+        """Compute the configured metric report and return the headline as score."""
+        confusion = _build_confusion(results, self.spec.classes)
+        beta_sq = (
+            self.spec.f_value * self.spec.f_value
+            if isinstance(self.spec, FScoreAggregatorSpec)
+            else 0.0  # ignored by the precision / recall formulas
+        )
+        metric_type = self.spec.type
+
+        per_class: dict[str, PerClassMetrics] = {}
+        total_tp = 0
+        total_fp = 0
+        total_fn = 0
+        k = len(confusion.classes)
+
+        for c, label in enumerate(confusion.classes):
+            tp = confusion.matrix[c][c]  # predicted c and expected c
+            fp = sum(confusion.matrix[c][j] for j in range(k)) - tp  # row c minus tp
+            fn = sum(confusion.matrix[j][c] for j in range(k)) - tp  # column c minus tp
+            tn = confusion.n_scored - tp - fp - fn  # the remaining scored datapoints
+            total_tp += tp
+            total_fp += fp
+            total_fn += fn
+            per_class[label] = PerClassMetrics(
+                tp=tp,
+                tn=tn,
+                fp=fp,
+                fn=fn,
+                support=tp + fn,  # scored datapoints whose expected label is this class
+                value=_metric(metric_type, tp, fp, fn, beta_sq),
+            )
+
+        micro = _metric(metric_type, total_tp, total_fp, total_fn, beta_sq)  # pooled counts
+        macro = sum(per_class[c].value for c in confusion.classes) / k  # unweighted mean
+
+        details = ClassificationDetails(
+            metric=metric_type,
+            average=self.spec.averaging,
+            classes=confusion.classes,
+            confusion_matrix=confusion.matrix,
+            per_class=per_class,
+            micro=micro,
+            macro=macro,
+            n_total=confusion.n_total,
+            n_scored=confusion.n_scored,
+            n_skipped=confusion.n_skipped,
+        )
+
+        headline = micro if self.spec.averaging == "micro" else macro
+        return NumericEvaluationResult(score=headline, details=details)
+
+
+def _metric(metric_type: str, tp: int, fp: int, fn: int, beta_sq: float) -> float:
+    """Return precision, recall or F-beta for the counts (0.0 when undefined)."""
+    if metric_type == "precision":
+        return tp / (tp + fp) if (tp + fp) > 0 else 0.0
+    if metric_type == "recall":
+        return tp / (tp + fn) if (tp + fn) > 0 else 0.0
+    if metric_type == "fscore":
+        p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        denom = beta_sq * p + r  # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R)
+        return (1 + beta_sq) * p * r / denom if denom > 0 else 0.0
+    raise ValueError(
+        f"Unknown metric_type: {metric_type!r}. "
+        "Expected one of: precision, recall, fscore."
+    )
diff --git a/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
new file mode 100644
index 000000000..9cd895ad2
--- /dev/null
+++ b/packages/uipath/src/uipath/eval/evaluators/dataset_evaluator_factory.py
@@ -0,0 +1,27 @@
+"""Factory that instantiates dataset-level evaluators from aggregator specs.
+
+Dataset evaluators are built from a self-contained :class:`AggregatorSpec`
+embedded in a per-datapoint classification evaluator's config, plus the source
+evaluator's name (supplied by the runtime when walking those configs). All
+three aggregator types share a single :class:`ClassificationDatasetEvaluator`
+implementation that dispatches on ``spec.type`` internally.
+"""
+
+from __future__ import annotations
+
+from ._aggregator_specs import AggregatorSpec
+from .classification_dataset_evaluators import ClassificationDatasetEvaluator
+
+
+def build_dataset_evaluator(
+    spec: AggregatorSpec,
+    source_evaluator: str,
+) -> ClassificationDatasetEvaluator:
+    """Build a dataset evaluator instance from an aggregator spec.
+
+    Args:
+        spec: A validated :class:`AggregatorSpec` (precision / recall / fscore).
+        source_evaluator: Name of the per-datapoint evaluator whose results
+            this aggregator consumes.
+    """
+    return ClassificationDatasetEvaluator(spec, source_evaluator)
diff --git a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
index 69790c3aa..842d13174 100644
--- a/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
+++ b/packages/uipath/src/uipath/eval/evaluators/multiclass_classification_evaluator.py
@@ -20,6 +20,7 @@
     UiPathEvaluationError,
     UiPathEvaluationErrorCategory,
 )
+from ._aggregator_specs import AggregatorSpec
 from .base_evaluator import BaseEvaluationCriteria, BaseEvaluatorJustification
 from .output_evaluator import (
     BaseOutputEvaluator,
@@ -43,6 +44,12 @@ class MulticlassClassificationEvaluatorConfig(
     metric_type: Literal["precision", "recall", "f-score"] = "f-score"
     averaging: Literal["micro", "macro"] = "macro"
     f_value: float = 1.0
+    # Optional run-level aggregators (precision / recall / fscore). Each is a
+    # self-contained spec carrying its own ``classes``, ``averaging``, and
+    # (for fscore) ``f_value``. After all per-datapoint evaluators complete,
+    # the runtime emits one result per aggregator, keyed by
+    # ``{evaluator_name}.{aggregator.type}`` (suffixed when a type repeats).
+    aggregators: list[AggregatorSpec] | None = None
 
 
 class MulticlassClassificationEvaluator(
diff --git a/packages/uipath/src/uipath/eval/runtime/_types.py b/packages/uipath/src/uipath/eval/runtime/_types.py
index 2aee5e599..fa84f0d9e 100644
--- a/packages/uipath/src/uipath/eval/runtime/_types.py
+++ b/packages/uipath/src/uipath/eval/runtime/_types.py
@@ -1,7 +1,7 @@
 import logging
 
 from opentelemetry.sdk.trace import ReadableSpan
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 from pydantic.alias_generators import to_camel
 
 from uipath.runtime import UiPathRuntimeResult
@@ -78,6 +78,9 @@ class UiPathEvalOutput(BaseModel):
 
     evaluation_set_name: str
     evaluation_set_results: list[UiPathEvalRunResult]
+    dataset_evaluator_results: dict[str, EvaluationResultDto] = Field(
+        default_factory=dict
+    )
 
     @property
     def score(self) -> float:
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
index 7f7614446..7167d7f20 100644
--- a/packages/uipath/src/uipath/eval/runtime/runtime.py
+++ b/packages/uipath/src/uipath/eval/runtime/runtime.py
@@ -45,7 +45,15 @@
 from uipath.runtime.schema import UiPathRuntimeSchema
 
 from .._execution_context import ExecutionSpanCollector
+from ..evaluators._aggregator_specs import AggregatorSpec, FScoreAggregatorSpec
 from ..evaluators.base_evaluator import GenericBaseEvaluator
+from ..evaluators.binary_classification_evaluator import (
+    BinaryClassificationEvaluatorConfig,
+)
+from ..evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from ..evaluators.multiclass_classification_evaluator import (
+    MulticlassClassificationEvaluatorConfig,
+)
 from ..evaluators.output_evaluator import OutputEvaluationCriteria
 from ..helpers import get_agent_model
 from ..mocks._cache_manager import CacheManager
@@ -202,6 +210,97 @@ def compute_evaluator_scores(
     return final_score, agg_metrics_per_evaluator
 
 
+def compute_dataset_evaluator_results(
+    evaluation_set_results: list[UiPathEvalRunResult],
+    evaluators: Iterable[GenericBaseEvaluator[Any, Any, Any]],
+) -> dict[str, EvaluationResultDto]:
+    """Run any dataset-level aggregators embedded in per-datapoint evaluator configs.
+
+    Walks ``evaluators`` looking for any whose config carries an ``aggregators``
+    list (currently only Binary/Multiclass classification). For each aggregator
+    spec, builds the corresponding dataset evaluator via the factory and runs it
+    over the per-datapoint results that came from that source evaluator.
+
+    Args:
+        evaluation_set_results: Per-datapoint results from the run.
+        evaluators: Per-datapoint evaluator instances that ran during this eval
+            set. Their configs may carry ``aggregators`` lists.
+
+    Returns:
+        Dict mapping ``"{evaluator_name}.{aggregator_type}"`` to the run-level
+        EvaluationResultDto. When the same aggregator ``type`` appears more
+        than once on a source (e.g. macro+micro precision), each variant is
+        keyed ``"{evaluator_name}.{type}.{averaging}"`` plus, for fscore, a
+        ``".fb{f_value}"`` suffix. A key that is still taken (specs differing
+        only in ``classes``) gets an ordinal suffix (``".2"``, ``".3"``, ...)
+        so no result is overwritten. Aggregators whose source produced no
+        results still run, on an empty list, and emit a zeroed result.
+    """
+    results_by_evaluator: defaultdict[str, list[EvaluationResultDto]] = defaultdict(
+        list
+    )
+    for eval_run_result in evaluation_set_results:
+        for eval_run_result_dto in eval_run_result.evaluation_run_results:
+            if eval_run_result_dto.is_line_result:
+                continue
+            results_by_evaluator[eval_run_result_dto.evaluator_name].append(
+                eval_run_result_dto.result
+            )
+
+    dataset_results: dict[str, EvaluationResultDto] = {}
+    for evaluator in evaluators:
+        # ``GenericBaseEvaluator`` doesn't declare ``evaluator_config``; read it
+        # via ``getattr`` and narrow to the configs that carry ``aggregators``.
+        config = getattr(evaluator, "evaluator_config", None)
+        if not isinstance(
+            config,
+            (
+                BinaryClassificationEvaluatorConfig,
+                MulticlassClassificationEvaluatorConfig,
+            ),
+        ):
+            continue
+        if not config.aggregators:
+            continue
+        source_name = config.name
+        source_results = results_by_evaluator.get(source_name, [])
+        # ``{source}.{type}`` collides when a type repeats on one source (e.g.
+        # macro+micro precision), so count types up front and only add the
+        # ``.{averaging}`` / ``.fb{f_value}`` suffixes when that happens.
+        type_counts: dict[str, int] = defaultdict(int)
+        for spec in config.aggregators:
+            type_counts[spec.type] += 1
+        for spec in config.aggregators:
+            dataset_evaluator = build_dataset_evaluator(spec, source_name)
+            key = _dataset_result_key(source_name, spec, type_counts[spec.type] > 1)
+            # Specs that agree on type/averaging/f_value (differing only in
+            # ``classes``) still share a key; append an ordinal, never overwrite.
+            base_key, ordinal = key, 1
+            while key in dataset_results:
+                ordinal += 1
+                key = f"{base_key}.{ordinal}"
+            dataset_results[key] = EvaluationResultDto.from_evaluation_result(
+                dataset_evaluator.evaluate(source_results)
+            )
+    return dataset_results
+
+
+def _dataset_result_key(
+    source_name: str, spec: AggregatorSpec, disambiguate: bool
+) -> str:
+    """Build the result-dict key for a dataset evaluator.
+
+    Returns ``{source}.{type}`` unless ``disambiguate`` is set, in which case
+    ``.{averaging}`` is appended, plus ``.fb{f_value}`` for fscore (the float
+    is formatted as-is, e.g. ``fb2.0``). Specs equal on those fields share a key.
+    """
+    if not disambiguate:
+        return f"{source_name}.{spec.type}"
+    if isinstance(spec, FScoreAggregatorSpec):
+        return f"{source_name}.{spec.type}.{spec.averaging}.fb{spec.f_value}"
+    return f"{source_name}.{spec.type}.{spec.averaging}"
+
+
 class UiPathEvalRuntime:
     """Specialized runtime for evaluation runs, with access to the factory."""
 
@@ -381,6 +480,19 @@ async def execute(self) -> UiPathRuntimeResult:
                         evaluators,
                     )
 
+                    # Run any dataset-level aggregators embedded in per-datapoint
+                    # classification evaluator configs (the ``aggregators`` list).
+                    # Each aggregator consumes its parent evaluator's per-datapoint
+                    # results and emits one run-level EvaluationResultDto, keyed
+                    # ``{evaluator_name}.{aggregator_type}`` (suffixed when a type
+                    # repeats), on UiPathEvalOutput.dataset_evaluator_results.
+                    results.dataset_evaluator_results = (
+                        compute_dataset_evaluator_results(
+                            results.evaluation_set_results,
+                            evaluators,
+                        )
+                    )
+
                     # Configure span with output and metadata
                     await configure_eval_set_run_span(
                         span=span,
diff --git a/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
new file mode 100644
index 000000000..e04a13fb0
--- /dev/null
+++ b/packages/uipath/tests/evaluators/test_dataset_classification_evaluators.py
@@ -0,0 +1,560 @@
+"""Tests for dataset-level classification evaluators (Precision, Recall, FScore).
+
+Covers the math (2-class, 3-class, micro vs macro, F-beta), edge cases
+(empty input, out-of-vocab labels, malformed details), factory dispatch, and
+runtime-level routing where compute_dataset_evaluator_results walks
+per-datapoint evaluator configs' embedded ``aggregators`` lists.
+"""
+
+import uuid
+
+import pytest
+from pydantic import BaseModel
+
+from uipath.eval.evaluators._aggregator_specs import (
+    FScoreAggregatorSpec,
+    PrecisionAggregatorSpec,
+    RecallAggregatorSpec,
+)
+from uipath.eval.evaluators.base_evaluator import BaseEvaluatorJustification
+from uipath.eval.evaluators.classification_dataset_evaluators import (
+    ClassificationDatasetEvaluator,
+    ClassificationDetails,
+)
+from uipath.eval.evaluators.dataset_evaluator_factory import build_dataset_evaluator
+from uipath.eval.evaluators.multiclass_classification_evaluator import (
+    MulticlassClassificationEvaluator,
+)
+from uipath.eval.models.models import (
+    EvaluationResultDto,
+    NumericEvaluationResult,
+)
+from uipath.eval.runtime._types import (
+    UiPathEvalRunResult,
+    UiPathEvalRunResultDto,
+)
+from uipath.eval.runtime.runtime import compute_dataset_evaluator_results
+
+
+def _result(
+    expected: str, actual: str, score: float | None = None
+) -> EvaluationResultDto:
+    """Return a per-datapoint DTO whose details hold the expected/actual pair.
+
+    A missing ``score`` becomes 1.0 on a case-insensitive match, else 0.0.
+    """
+    if score is None:
+        score = float(expected.lower() == actual.lower())
+    payload = BaseEvaluatorJustification(expected=expected, actual=actual)
+    return EvaluationResultDto(score=score, details=payload.model_dump())
+
+
+def _precision(
+    classes: list[str], averaging: str = "macro"
+) -> ClassificationDatasetEvaluator:
+    agg_spec = PrecisionAggregatorSpec(classes=classes, averaging=averaging)  # type: ignore[arg-type]
+    return ClassificationDatasetEvaluator(agg_spec, source_evaluator="intent_match")
+
+
+def _recall(
+    classes: list[str], averaging: str = "macro"
+) -> ClassificationDatasetEvaluator:
+    agg_spec = RecallAggregatorSpec(classes=classes, averaging=averaging)  # type: ignore[arg-type]
+    return ClassificationDatasetEvaluator(agg_spec, source_evaluator="intent_match")
+
+
+def _fscore(
+    classes: list[str], averaging: str = "macro", f_value: float = 1.0
+) -> ClassificationDatasetEvaluator:
+    """Build an F-score dataset evaluator fed by the ``intent_match`` source."""
+    fscore_spec = FScoreAggregatorSpec(
+        classes=classes, averaging=averaging, f_value=f_value  # type: ignore[arg-type]
+    )
+    source = "intent_match"
+    return ClassificationDatasetEvaluator(fscore_spec, source_evaluator=source)
+
+
+def _details(result: object) -> ClassificationDetails:
+    """Narrow ``result`` to NumericEvaluationResult and return its details."""
+    assert isinstance(result, NumericEvaluationResult)
+    assert isinstance(result.details, ClassificationDetails)
+    return result.details
+
+
+def _multiclass_evaluator(
+    name: str,
+    classes: list[str],
+    aggregators: list[BaseModel],
+) -> MulticlassClassificationEvaluator:
+    """Validate a multiclass evaluator whose config embeds ``aggregators``.
+
+    Each spec is dumped by alias before being placed in the payload.
+    """
+    config = {
+        "name": name,
+        "classes": classes,
+        "aggregators": [agg.model_dump(by_alias=True) for agg in aggregators],
+    }
+    payload = {"id": str(uuid.uuid4()), "evaluatorConfig": config}
+    return MulticlassClassificationEvaluator.model_validate(payload)
+
+
+class TestPrecisionEvaluator:
+    def test_empty_input_returns_zeroed_result(self) -> None:
+        outcome = _precision(["cat", "dog"]).evaluate([])
+        assert isinstance(outcome, NumericEvaluationResult)
+        assert outcome.score == 0.0
+        report = _details(outcome)
+        assert (report.n_total, report.n_scored) == (0, 0)
+        assert report.confusion_matrix == [[0, 0], [0, 0]]
+        assert report.per_class["cat"].tp == 0
+        assert report.per_class["cat"].tn == 0
+
+    def test_confusion_matrix_is_predicted_by_expected(self) -> None:
+        # The matrix is indexed confusion_matrix[predicted][expected], i.e. the
+        # transpose of sklearn's [true][predicted] layout; pin that orientation.
+        rows = [
+            _result("cat", "cat"),  # expected=cat, predicted=cat -> [cat][cat]
+            _result("cat", "dog"),  # expected=cat, predicted=dog -> [dog][cat]
+            _result("dog", "dog"),  # expected=dog, predicted=dog -> [dog][dog]
+            _result("dog", "dog"),
+        ]
+        matrix = _details(_precision(["cat", "dog"]).evaluate(rows)).confusion_matrix
+        # Index order follows the class list: cat=0, dog=1.
+        # predicted=cat, expected=cat: one hit
+        assert matrix[0][0] == 1
+        # predicted=dog, expected=cat: one miss (FP for dog, FN for cat)
+        assert matrix[1][0] == 1
+        # predicted=dog, expected=dog: two hits
+        assert matrix[1][1] == 2
+        # predicted=cat, expected=dog: never happens
+        assert matrix[0][1] == 0
+
+    def test_precision_two_class_macro(self) -> None:
+        rows = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        outcome = _precision(["yes", "no"], averaging="macro").evaluate(rows)
+        report = _details(outcome)
+        # "yes": 2 TP, 1 FP -> precision 2/3
+        # "no":  0 TP, 1 FP -> precision 0
+        # macro average over both classes: (2/3 + 0) / 2 = 1/3
+        assert report.per_class["yes"].value == pytest.approx(2 / 3)
+        assert report.per_class["no"].value == pytest.approx(0.0)
+        assert report.macro == pytest.approx(1 / 3)
+        assert outcome.score == pytest.approx(report.macro)
+
+    def test_two_class_micro_equals_accuracy(self) -> None:
+        rows = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        outcome = _precision(["yes", "no"], averaging="micro").evaluate(rows)
+        report = _details(outcome)
+        assert report.micro == pytest.approx(0.5)
+        assert outcome.score == pytest.approx(0.5)
+
+    def test_three_class_macro(self) -> None:
+        labelled = [
+            ("cat", "cat"),
+            ("cat", "cat"),
+            ("cat", "dog"),
+            ("dog", "dog"),
+            ("dog", "dog"),
+            ("dog", "bird"),
+            ("bird", "bird"),
+            ("bird", "bird"),
+            ("bird", "cat"),
+        ]
+        outcome = _precision(["cat", "dog", "bird"], averaging="macro").evaluate(
+            [_result(expected, actual) for expected, actual in labelled]
+        )
+        report = _details(outcome)
+        for label in ("cat", "dog", "bird"):
+            stats = report.per_class[label]
+            assert (stats.tp, stats.fp, stats.fn, stats.tn) == (2, 1, 1, 5)
+            assert stats.value == pytest.approx(2 / 3)
+        assert report.macro == pytest.approx(2 / 3)
+        assert outcome.score == pytest.approx(2 / 3)
+
+
+class TestRecallEvaluator:
+    def test_recall_two_class_macro(self) -> None:
+        rows = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        outcome = _recall(["yes", "no"], averaging="macro").evaluate(rows)
+        report = _details(outcome)
+        assert report.per_class["yes"].value == pytest.approx(2 / 3)
+        assert report.per_class["no"].value == pytest.approx(0.0)
+        assert outcome.score == pytest.approx(1 / 3)
+
+    def test_recall_differs_from_precision(self) -> None:
+        rows = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("no", "yes"),
+            _result("no", "yes"),
+            _result("no", "no"),
+        ]
+        prec = _details(_precision(["yes", "no"], averaging="macro").evaluate(rows))
+        rec = _details(_recall(["yes", "no"], averaging="macro").evaluate(rows))
+        assert prec.per_class["yes"].value == pytest.approx(0.5)
+        assert prec.per_class["no"].value == pytest.approx(1.0)
+        assert rec.per_class["yes"].value == pytest.approx(1.0)
+        assert rec.per_class["no"].value == pytest.approx(1 / 3)
+
+
+class TestFScoreEvaluator:
+    def test_f1_equals_harmonic_mean_of_p_and_r(self) -> None:
+        rows = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("yes", "no"),
+            _result("no", "yes"),
+        ]
+        # "yes" has P = R = 2/3 and "no" has P = R = 0, which F1 must reproduce.
+        evaluator = _fscore(["yes", "no"], averaging="macro", f_value=1.0)
+        report = _details(evaluator.evaluate(rows))
+        assert report.per_class["yes"].value == pytest.approx(2 / 3)
+        assert report.per_class["no"].value == pytest.approx(0.0)
+        assert report.macro == pytest.approx(1 / 3)
+
+    def test_f_beta_emphasizes_recall_when_beta_above_one(self) -> None:
+        rows = [
+            _result("yes", "yes"),
+            _result("yes", "yes"),
+            _result("no", "yes"),
+            _result("no", "yes"),
+            _result("no", "no"),
+        ]
+        # For "yes" precision is 0.5 and recall is 1.0, so a beta above one
+        # (which weights recall more heavily) must raise the F value.
+        f1_eval = _fscore(["yes", "no"], averaging="macro", f_value=1.0)
+        f2_eval = _fscore(["yes", "no"], averaging="macro", f_value=2.0)
+        f1_yes = _details(f1_eval.evaluate(rows)).per_class["yes"].value
+        f2_yes = _details(f2_eval.evaluate(rows)).per_class["yes"].value
+        assert f2_yes > f1_yes
+
+    def test_three_class_micro_pools_across_classes(self) -> None:
+        labelled = [
+            ("cat", "cat"),
+            ("cat", "cat"),
+            ("cat", "dog"),
+            ("dog", "dog"),
+            ("dog", "dog"),
+            ("dog", "bird"),
+            ("bird", "bird"),
+            ("bird", "bird"),
+            ("bird", "cat"),
+        ]
+        evaluator = _fscore(["cat", "dog", "bird"], averaging="micro", f_value=1.0)
+        rows = [_result(expected, actual) for expected, actual in labelled]
+        report = _details(evaluator.evaluate(rows))
+        # Every class has 2 TP, 1 FP and 1 FN, so pooling gives 6 TP, 3 FP and
+        # 3 FN: micro precision = recall = F1 = 6/9.
+        assert report.micro == pytest.approx(6 / 9)
+
+
+class TestSkippingAndEdgeCases:
+    def test_out_of_vocab_labels_are_skipped(self) -> None:
+        rows = [
+            _result("cat", "cat"),
+            _result("cat", "platypus"),
+            _result("zebra", "dog"),
+        ]
+        report = _details(_precision(["cat", "dog"]).evaluate(rows))
+        assert (report.n_total, report.n_scored, report.n_skipped) == (3, 1, 2)
+
+    def test_results_without_justification_are_skipped(self) -> None:
+        rows = [
+            _result("cat", "cat"),
+            EvaluationResultDto(score=1.0, details="just a string"),
+            EvaluationResultDto(score=0.0, details={"unrelated": "shape"}),
+        ]
+        report = _details(_precision(["cat", "dog"]).evaluate(rows))
+        assert (report.n_total, report.n_scored, report.n_skipped) == (3, 1, 2)
+
+    def test_case_insensitive(self) -> None:
+        rows = [_result("Cat", "CAT"), _result("DOG", "dog")]
+        per_class = _details(_precision(["cat", "dog"]).evaluate(rows)).per_class
+        assert per_class["cat"].tp == 1
+        assert per_class["dog"].tp == 1
+
+
+class TestFactory:
+    """Factory dispatch: aggregator spec + source name -> dataset evaluator."""
+
+    def test_builds_precision_from_spec(self) -> None:
+        agg = PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro")
+        built = build_dataset_evaluator(agg, "intent_match")
+        assert isinstance(built, ClassificationDatasetEvaluator)
+        assert built.spec.type == "precision"
+        assert built.source_evaluator == "intent_match"
+        assert built.name == "intent_match.precision"
+
+    def test_builds_recall_from_spec(self) -> None:
+        agg = RecallAggregatorSpec(classes=["yes", "no"], averaging="micro")
+        built = build_dataset_evaluator(agg, "intent_match")
+        assert isinstance(built, ClassificationDatasetEvaluator)
+        assert built.spec.type == "recall"
+        assert built.name == "intent_match.recall"
+
+    def test_builds_fscore_from_spec(self) -> None:
+        agg = FScoreAggregatorSpec(
+            classes=["yes", "no"], averaging="macro", f_value=2.0
+        )
+        built = build_dataset_evaluator(agg, "intent_match")
+        assert isinstance(built, ClassificationDatasetEvaluator)
+        assert isinstance(built.spec, FScoreAggregatorSpec)
+        assert built.spec.f_value == 2.0
+
+
+class TestAggregatorSpecJsonRoundTrip:
+    """Pin the JSON wire shape of aggregator specs sent to the C# side."""
+
+    def test_precision_uses_self_contained_fields(self) -> None:
+        wire = {
+            "type": "precision",
+            "classes": ["book", "cancel", "reschedule"],
+            "averaging": "macro",
+        }
+        spec = PrecisionAggregatorSpec.model_validate(wire)
+        # Dumping by alias must reproduce the input payload exactly,
+        # with no extra or renamed fields.
+        assert spec.model_dump(by_alias=True) == {
+            "type": "precision",
+            "classes": ["book", "cancel", "reschedule"],
+            "averaging": "macro",
+        }
+
+    def test_fscore_uses_camelcase_fvalue_on_wire(self) -> None:
+        wire = {
+            "type": "fscore",
+            "classes": ["yes", "no"],
+            "averaging": "macro",
+            "fValue": 1.5,
+        }
+        spec = FScoreAggregatorSpec.model_validate(wire)
+        assert spec.f_value == 1.5
+        # The alias, not the Python attribute name, is what gets serialized.
+        serialized = spec.model_dump(by_alias=True)
+        assert serialized["fValue"] == 1.5
+        assert "f_value" not in serialized
+
+    def test_multiclass_evaluator_round_trips_aggregators(self) -> None:
+        """Aggregator specs embedded in the evaluator config survive dump+load.
+
+        The helper dumps each spec by alias and validates the whole evaluator.
+        """
+        labels = ["book", "cancel", "reschedule"]
+        ev = _multiclass_evaluator(
+            "intent_classifier",
+            classes=labels,
+            aggregators=[
+                PrecisionAggregatorSpec(classes=labels, averaging="macro"),
+                FScoreAggregatorSpec(classes=labels, averaging="macro", f_value=1.0),
+            ],
+        )
+        embedded = ev.evaluator_config.aggregators
+        assert embedded is not None
+        assert len(embedded) == 2
+        assert embedded[0].type == "precision"
+        assert embedded[1].type == "fscore"
+
+
+class TestComputeDatasetEvaluatorResults:
+    """Runtime routing: aggregators[] on evaluator configs become dataset results."""
+
+    def test_walks_aggregators_on_classification_evaluator(self) -> None:
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+                RecallAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
+
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                    UiPathEvalRunResultDto(
+                        evaluator_name="some_other_evaluator",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=EvaluationResultDto(score=0.5),
+                    ),
+                ],
+            ),
+            UiPathEvalRunResult(
+                evaluation_name="dp2",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "no"),
+                    ),
+                ],
+            ),
+        ]
+
+        computed = compute_dataset_evaluator_results(runs, [source])
+        # One key per aggregator, each prefixed with the source evaluator name.
+        assert set(computed) == {"intent_match.precision", "intent_match.recall"}
+        precision_dto = computed["intent_match.precision"]
+        assert isinstance(precision_dto, EvaluationResultDto)
+        assert isinstance(precision_dto.details, dict)
+        # Only the two intent_match results count; some_other_evaluator's does not.
+        assert precision_dto.details["n_scored"] == 2
+
+    def test_evaluator_without_aggregators_is_skipped(self) -> None:
+        source = _multiclass_evaluator(
+            "intent_match", classes=["yes", "no"], aggregators=[]
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                ],
+            ),
+        ]
+        computed = compute_dataset_evaluator_results(runs, [source])
+        assert computed == {}
+
+    def test_line_by_line_subresults_are_excluded(self) -> None:
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                        is_line_result=True,
+                    ),
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("no", "no"),
+                    ),
+                ],
+            ),
+        ]
+        computed = compute_dataset_evaluator_results(runs, [source])
+        assert isinstance(computed["intent_match.precision"].details, dict)
+        assert computed["intent_match.precision"].details["n_scored"] == 1
+
+    def test_source_with_no_results_produces_zeroed_report(self) -> None:
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+            ],
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="some_other_evaluator",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=EvaluationResultDto(score=1.0),
+                    ),
+                ],
+            ),
+        ]
+        computed = compute_dataset_evaluator_results(runs, [source])
+        zeroed = computed["intent_match.precision"]
+        assert zeroed.score == 0.0
+        assert isinstance(zeroed.details, dict)
+        assert zeroed.details["n_scored"] == 0
+
+    def test_duplicate_aggregator_type_disambiguates_by_averaging(self) -> None:
+        """Macro and micro precision on one source land under separate keys."""
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="macro"),
+                PrecisionAggregatorSpec(classes=["yes", "no"], averaging="micro"),
+            ],
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                ],
+            ),
+        ]
+        computed = compute_dataset_evaluator_results(runs, [source])
+        # A repeated type switches to the ``.{averaging}`` suffixed key form,
+        # so the second precision result does not replace the first.
+        assert set(computed) == {
+            "intent_match.precision.macro",
+            "intent_match.precision.micro",
+        }
+
+    def test_duplicate_fscore_disambiguates_by_averaging_and_fvalue(self) -> None:
+        """F1-macro and F2-macro share an averaging, so f_value splits the keys."""
+        source = _multiclass_evaluator(
+            "intent_match",
+            classes=["yes", "no"],
+            aggregators=[
+                FScoreAggregatorSpec(
+                    classes=["yes", "no"], averaging="macro", f_value=1.0
+                ),
+                FScoreAggregatorSpec(
+                    classes=["yes", "no"], averaging="macro", f_value=2.0
+                ),
+            ],
+        )
+        runs = [
+            UiPathEvalRunResult(
+                evaluation_name="dp1",
+                evaluation_run_results=[
+                    UiPathEvalRunResultDto(
+                        evaluator_name="intent_match",
+                        evaluator_id=str(uuid.uuid4()),
+                        result=_result("yes", "yes"),
+                    ),
+                ],
+            ),
+        ]
+        computed = compute_dataset_evaluator_results(runs, [source])
+        assert set(computed) == {
+            "intent_match.fscore.macro.fb1.0",
+            "intent_match.fscore.macro.fb2.0",
+        }