diff --git a/common/polystar/common/image_pipeline/preprocessors/normalise.py b/common/polystar/common/image_pipeline/preprocessors/normalise.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00c8d0d31d6445b476e88ad5fea20eb7bf09e3d
--- /dev/null
+++ b/common/polystar/common/image_pipeline/preprocessors/normalise.py
@@ -0,0 +1,8 @@
+from polystar.common.models.image import Image
+from polystar.common.pipeline.pipe_abc import PipeABC
+
+
+class Normalise(PipeABC):
+    def transform_single(self, image: Image) -> Image:
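+        # Scale pixel values from the [0, 255] range to [0, 1]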
+        return image / 255
diff --git a/common/polystar/common/image_pipeline/preprocessors/resize.py b/common/polystar/common/image_pipeline/preprocessors/resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..6afbc2b112b787659de43c8575d61e806d969ae7
--- /dev/null
+++ b/common/polystar/common/image_pipeline/preprocessors/resize.py
@@ -0,0 +1,15 @@
+from typing import Tuple
+
+from cv2.cv2 import resize
+
+from polystar.common.models.image import Image
+from polystar.common.pipeline.pipe_abc import PipeABC
+
+
+class Resize(PipeABC):
+    def __init__(self, size: Tuple[int, int]):
+        self.size = size
+
+    def transform_single(self, image: Image) -> Image:
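+        # cv2.resize expects the target size as (width, height)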
+        return resize(image, self.size)
diff --git a/common/polystar/common/models/image.py b/common/polystar/common/models/image.py
index 4d598f562cd21be2b1cbb8050e2c9860a5df1447..29a0b13b3f7af5e2d689098932e252af79f446a4 100644
--- a/common/polystar/common/models/image.py
+++ b/common/polystar/common/models/image.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, List
 
 import cv2
 import numpy as np
@@ -38,3 +38,8 @@ def load_images_in_directory(
 def save_image(image: Image, image_path: Path, conversion: int = cv2.COLOR_RGB2BGR):
     image_path.parent.mkdir(exist_ok=True, parents=True)
     cv2.imwrite(str(image_path), cv2.cvtColor(image, conversion))
+
+
+def file_images_to_images(file_images: Iterable[FileImage]) -> List[Image]:
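+    # Convert each FileImage into a plain numpy image array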
+    return [np.asarray(file_image) for file_image in file_images]
diff --git a/common/polystar/common/pipeline/classification/classification_pipeline.py b/common/polystar/common/pipeline/classification/classification_pipeline.py
index 074cc15e5942abe0593f547939c64747224b1ba5..99c85f5000881185af3362c2856a856f06fcee71 100644
--- a/common/polystar/common/pipeline/classification/classification_pipeline.py
+++ b/common/polystar/common/pipeline/classification/classification_pipeline.py
@@ -2,7 +2,7 @@ from abc import ABC
 from enum import IntEnum
 from typing import ClassVar, Generic, List, Sequence, Tuple, TypeVar
 
-from numpy import asarray, ndarray
+from numpy import asarray, ndarray, pad
 
 from polystar.common.pipeline.classification.classifier_abc import ClassifierABC
 from polystar.common.pipeline.pipe_abc import IT, PipeABC
@@ -29,6 +29,14 @@ class ClassificationPipeline(Pipeline, Generic[IT, EnumT], ABC):
     def predict(self, x: Sequence[IT]) -> List[EnumT]:
         return self.predict_proba_and_classes(x)[1]
 
+    def predict_proba(self, x: Sequence[IT]) -> ndarray:
+        proba = super().predict_proba(x)
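+        # Pad with zero-probability columns when the classifier saw fewer classes than the enum defines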
+        missing_classes = self.classifier.n_classes - proba.shape[1]
+        if not missing_classes:
+            return proba
+        return pad(proba, ((0, 0), (0, missing_classes)))
+
     def predict_proba_and_classes(self, x: Sequence[IT]) -> Tuple[ndarray, List[EnumT]]:
         proba = asarray(self.predict_proba(x))
         indices = proba.argmax(axis=1)
diff --git a/common/polystar/common/utils/iterable_utils.py b/common/polystar/common/utils/iterable_utils.py
index a0046880057618448b223fab0f64ab231619546a..01bc2da41ef0b4ba3894a14cdf989c745c6829fe 100644
--- a/common/polystar/common/utils/iterable_utils.py
+++ b/common/polystar/common/utils/iterable_utils.py
@@ -1,4 +1,6 @@
-from typing import Iterable
+from collections import defaultdict
+from itertools import chain
+from typing import Callable, Dict, Iterable, List, TypeVar
 
 from more_itertools import ilen
 
@@ -8,3 +10,20 @@ def smart_len(it: Iterable) -> int:
         return len(it)
     except AttributeError:
         return ilen(it)
+
+
+T = TypeVar("T")
+
+
+def flatten(it: Iterable[Iterable[T]]) -> List[T]:
+    return list(chain.from_iterable(it))
+
+
+U = TypeVar("U")
+
+
+def group_by(it: Iterable[T], key: Callable[[T], U]) -> Dict[U, List[T]]:
+    rv = defaultdict(list)
+    for item in it:
+        rv[key(item)].append(item)
+    return rv
diff --git a/common/polystar/common/utils/markdown.py b/common/polystar/common/utils/markdown.py
index 79a9d8360d4fd0e6d6c0ff444c4246a7bcafda04..3997375130872f443774927e0585f9e2d76fc552 100644
--- a/common/polystar/common/utils/markdown.py
+++ b/common/polystar/common/utils/markdown.py
@@ -1,6 +1,7 @@
 from pathlib import Path
-from typing import TextIO, Iterable, Any
+from typing import Any, Iterable, TextIO
 
+from matplotlib.figure import Figure
 from pandas import DataFrame
 from tabulate import tabulate
 
@@ -35,7 +36,12 @@ class MarkdownFile:
         self.paragraph(f"![{alt}]({relative_path})")
         return self
 
+    def figure(self, figure: Figure, name: str, alt: str = "img"):
+        figure.savefig(self.markdown_path.parent / name)
+        return self.image(name, alt)
+
     def table(self, data: DataFrame) -> "MarkdownFile":
-        self.file.write(tabulate(data, tablefmt="pipe", headers="keys"))
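+        # Strip the ".0" that tabulate renders for whole-number floats, padding with spaces to keep column alignment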
+        self.file.write(tabulate(data, tablefmt="pipe", headers="keys").replace(".0 ", "   "))
         self.file.write("\n\n")
         return self
diff --git a/common/research/common/datasets/image_dataset.py b/common/research/common/datasets/image_dataset.py
index 13bb5a584cb817f7bcb53da811b70f2e10b3e8d8..9378439106a6d241143240205dbb9674ca716991 100644
--- a/common/research/common/datasets/image_dataset.py
+++ b/common/research/common/datasets/image_dataset.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from polystar.common.models.image import Image
+from polystar.common.models.image import FileImage, Image
 from research.common.datasets.dataset import Dataset
 from research.common.datasets.lazy_dataset import LazyDataset, TargetT
 
@@ -9,3 +9,6 @@ FileDataset = Dataset[Path, TargetT]
 
 LazyImageDataset = LazyDataset[Image, TargetT]
 ImageDataset = Dataset[Image, TargetT]
+
+LazyFileImageDataset = LazyDataset[FileImage, TargetT]
+FileImageDataset = Dataset[FileImage, TargetT]
diff --git a/dataset/dji_roco/robomaster_Final Tournament/digits/.changes b/dataset/dji_roco/robomaster_Final Tournament/digits/.changes
index 35d75bb3ff7e3bcf5ffc930deef3313e092ba0fe..094f578582ac0df9123e392b0be3aec31d5a695b 100644
Binary files a/dataset/dji_roco/robomaster_Final Tournament/digits/.changes and b/dataset/dji_roco/robomaster_Final Tournament/digits/.changes differ
diff --git a/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py b/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py
new file mode 100644
index 0000000000000000000000000000000000000000..a01bf0de068690b9dfb1582a668874aa95739365
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py
@@ -0,0 +1,20 @@
+from typing import List
+
+from polystar.common.models.object import ArmorColor
+from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
+from research.robots_at_robots.armor_color.armor_color_dataset import make_armor_color_dataset_generator
+from research.robots_at_robots.evaluation.benchmark import make_armor_value_benchmarker
+
+
+def make_armor_color_benchmarker(
+    train_roco_datasets: List[ROCODatasetBuilder], test_roco_datasets: List[ROCODatasetBuilder], experiment_name: str
+):
+    dataset_generator = make_armor_color_dataset_generator()
+    return make_armor_value_benchmarker(
+        train_roco_datasets=train_roco_datasets,
+        test_roco_datasets=test_roco_datasets,
+        evaluation_project="armor-color",
+        experiment_name=experiment_name,
+        classes=list(ArmorColor),
+        dataset_generator=dataset_generator,
+    )
diff --git a/robots-at-robots/research/robots_at_robots/armor_color/armor_color_pipeline_reporter_factory.py b/robots-at-robots/research/robots_at_robots/armor_color/armor_color_pipeline_reporter_factory.py
deleted file mode 100644
index a24ad1a25b129f19b44c2c5cb2f6ec1af498550d..0000000000000000000000000000000000000000
--- a/robots-at-robots/research/robots_at_robots/armor_color/armor_color_pipeline_reporter_factory.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from typing import List
-
-from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
-from research.robots_at_robots.armor_color.armor_color_dataset import make_armor_color_dataset_generator
-from research.robots_at_robots.evaluation.image_pipeline_evaluation_reporter import ImagePipelineEvaluationReporter
-from research.robots_at_robots.evaluation.image_pipeline_evaluator import ImagePipelineEvaluator
-
-
-class ArmorColorPipelineReporterFactory:
-    @staticmethod
-    def from_roco_datasets(
-        train_roco_datasets: List[ROCODatasetBuilder],
-        test_roco_datasets: List[ROCODatasetBuilder],
-        experiment_name: str,
-    ):
-        return ImagePipelineEvaluationReporter(
-            evaluator=ImagePipelineEvaluator(
-                train_roco_datasets=train_roco_datasets,
-                test_roco_datasets=test_roco_datasets,
-                image_dataset_generator=make_armor_color_dataset_generator(),
-            ),
-            evaluation_project="armor-color",
-            experiment_name=experiment_name,
-        )
diff --git a/robots-at-robots/research/robots_at_robots/armor_color/baseline_experiments.py b/robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
similarity index 85%
rename from robots-at-robots/research/robots_at_robots/armor_color/baseline_experiments.py
rename to robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
index 703fefe2757b2afbf2dd9e7cffc620e0b139824e..1ac6f2b9ce966d91659b005f6206fa35f80c7647 100644
--- a/robots-at-robots/research/robots_at_robots/armor_color/baseline_experiments.py
+++ b/robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
@@ -13,9 +13,7 @@ from polystar.common.pipeline.classification.random_model import RandomClassifie
 from polystar.common.pipeline.classification.rule_based_classifier import RuleBasedClassifierABC
 from polystar.common.pipeline.pipe_abc import PipeABC
 from research.common.datasets.roco.zoo.roco_dataset_zoo import ROCODatasetsZoo
-from research.robots_at_robots.armor_color.armor_color_pipeline_reporter_factory import (
-    ArmorColorPipelineReporterFactory,
-)
+from research.robots_at_robots.armor_color.armor_color_benchmarker import make_armor_color_benchmarker
 
 
 class ArmorColorPipeline(ClassificationPipeline):
@@ -38,20 +36,20 @@ class RedBlueComparisonClassifier(RuleBasedClassifierABC):
 if __name__ == "__main__":
     logging.getLogger().setLevel("INFO")
 
-    reporter = ArmorColorPipelineReporterFactory.from_roco_datasets(
-        train_roco_datasets=[
+    _benchmarker = make_armor_color_benchmarker(
+        [
             ROCODatasetsZoo.TWITCH.T470150052,
             ROCODatasetsZoo.TWITCH.T470152289,
             ROCODatasetsZoo.TWITCH.T470149568,
             ROCODatasetsZoo.TWITCH.T470151286,
         ],
-        test_roco_datasets=[
+        [
             ROCODatasetsZoo.TWITCH.T470152838,
             ROCODatasetsZoo.TWITCH.T470153081,
             ROCODatasetsZoo.TWITCH.T470158483,
             ROCODatasetsZoo.TWITCH.T470152730,
         ],
-        experiment_name="test",
+        "test",
     )
 
     red_blue_comparison_pipeline = ArmorColorPipeline.from_pipes(
@@ -62,4 +60,4 @@ if __name__ == "__main__":
         [RGB2HSV(), Histogram2D(), LogisticRegression()], name="hsv-hist-lr",
     )
 
-    reporter.report([random_pipeline, red_blue_comparison_pipeline, hsv_hist_lr_pipeline])
+    _benchmarker.benchmark([random_pipeline, red_blue_comparison_pipeline, hsv_hist_lr_pipeline])
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py b/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4792c43adcec2192b74a2457662cae09d028681
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py
@@ -0,0 +1,20 @@
+from typing import List
+
+from polystar.common.models.object import ArmorDigit
+from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
+from research.robots_at_robots.armor_digit.armor_digit_dataset import make_armor_digit_dataset_generator
+from research.robots_at_robots.evaluation.benchmark import make_armor_value_benchmarker
+
+
+def make_armor_digit_benchmarker(
+    train_roco_datasets: List[ROCODatasetBuilder], test_roco_datasets: List[ROCODatasetBuilder], experiment_name: str
+):
+    dataset_generator = make_armor_digit_dataset_generator()
+    return make_armor_value_benchmarker(
+        train_roco_datasets=train_roco_datasets,
+        test_roco_datasets=test_roco_datasets,
+        evaluation_project="armor-digit",
+        experiment_name=experiment_name,
+        classes=list(ArmorDigit),
+        dataset_generator=dataset_generator,
+    )
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_pipeline_reporter_factory.py b/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_pipeline_reporter_factory.py
deleted file mode 100644
index 6c5f9a02c995bc24bfaa7399a148136740e68dcc..0000000000000000000000000000000000000000
--- a/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_pipeline_reporter_factory.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from typing import List
-
-from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
-from research.robots_at_robots.armor_digit.armor_digit_dataset import make_armor_digit_dataset_generator
-from research.robots_at_robots.evaluation.image_pipeline_evaluation_reporter import ImagePipelineEvaluationReporter
-from research.robots_at_robots.evaluation.image_pipeline_evaluator import ImagePipelineEvaluator
-
-
-class ArmorDigitPipelineReporterFactory:
-    @staticmethod
-    def from_roco_datasets(
-        train_roco_datasets: List[ROCODatasetBuilder],
-        test_roco_datasets: List[ROCODatasetBuilder],
-        experiment_name: str,
-    ):
-        return ImagePipelineEvaluationReporter(
-            evaluator=ImagePipelineEvaluator(
-                train_roco_datasets=train_roco_datasets,
-                test_roco_datasets=test_roco_datasets,
-                image_dataset_generator=make_armor_digit_dataset_generator(),
-            ),
-            evaluation_project="armor-digit",
-            experiment_name=experiment_name,
-        )
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py b/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
index 757247e1860a957a00565137351d10a0fc069749..1b48d0ea6c6c17c276ad28d3e1687a2446f42a51 100644
--- a/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
+++ b/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
@@ -3,8 +3,6 @@ import warnings
 from pathlib import Path
 from typing import List, Sequence, Tuple
 
-import seaborn as sns
-from cv2.cv2 import resize
 from keras_preprocessing.image import ImageDataGenerator
 from numpy import asarray
 from tensorflow_core.python.keras import Input, Model, Sequential
@@ -15,16 +13,15 @@ from tensorflow_core.python.keras.optimizer_v2.adam import Adam
 from tensorflow_core.python.keras.optimizer_v2.gradient_descent import SGD
 from tensorflow_core.python.keras.utils.np_utils import to_categorical
 
+from polystar.common.image_pipeline.preprocessors.normalise import Normalise
+from polystar.common.image_pipeline.preprocessors.resize import Resize
 from polystar.common.models.image import Image
 from polystar.common.models.object import ArmorDigit
 from polystar.common.pipeline.classification.classification_pipeline import ClassificationPipeline
 from polystar.common.pipeline.classification.classifier_abc import ClassifierABC
 from polystar.common.pipeline.classification.random_model import RandomClassifier
-from polystar.common.pipeline.pipe_abc import PipeABC
 from research.common.datasets.roco.zoo.roco_dataset_zoo import ROCODatasetsZoo
-from research.robots_at_robots.armor_digit.armor_digit_pipeline_reporter_factory import (
-    ArmorDigitPipelineReporterFactory,
-)
+from research.robots_at_robots.armor_digit.armor_digit_benchmarker import make_armor_digit_benchmarker
 
 
 class ArmorDigitPipeline(ClassificationPipeline):
@@ -45,14 +42,14 @@ class KerasClassifier(ClassifierABC):
         return ImageDataGenerator(rotation_range=45, zoom_range=[0.8, 1])  # brightness_range=[0.7, 1.4]
 
     def fit(self, images: List[Image], labels: List[int]) -> "KerasClassifier":
-        n_val: int = 540  # FIXME
+        n_val: int = 371  # FIXME
         images = asarray(images)
         labels = to_categorical(asarray(labels), 5)  # FIXME
         train_images, train_labels = images[:-n_val], labels[:-n_val]
         val_images, val_labels = images[-n_val:], labels[-n_val:]
 
         batch_size = 32  # FIXME
-        train_generator = self.train_data_gen.flow(train_images, train_labels, batch_size)
+        train_generator = self.train_data_gen.flow(train_images, train_labels, batch_size=batch_size, shuffle=True)
 
         self.model.fit(
             x=train_generator,
@@ -100,19 +97,6 @@ class CNN(KerasClassifier):
         )
 
 
-class Resize(PipeABC):
-    def __init__(self, size: Tuple[int, int]):
-        self.size = size
-
-    def transform_single(self, image: Image) -> Image:
-        return resize(image, self.size)
-
-
-class Normalise(PipeABC):
-    def transform_single(self, image: Image) -> Image:
-        return image / 255
-
-
 def make_digits_cnn_pipeline(
     input_size: int, conv_blocks: Sequence[Sequence[int]], report_dir: Path, with_data_augmentation: bool, lr: float
 ) -> ArmorDigitPipeline:
@@ -186,9 +170,7 @@ if __name__ == "__main__":
     logging.getLogger("tensorflow").setLevel("ERROR")
     warnings.filterwarnings("ignore")
 
-    sns.set_style()
-
-    reporter = ArmorDigitPipelineReporterFactory.from_roco_datasets(
+    _benchmarker = make_armor_digit_benchmarker(
         train_roco_datasets=[
             # ROCODatasetsZoo.DJI.CENTRAL_CHINA,
             # ROCODatasetsZoo.DJI.FINAL,
@@ -200,32 +182,39 @@ if __name__ == "__main__":
             ROCODatasetsZoo.TWITCH.T470152289,
         ],
         test_roco_datasets=[
-            #
             ROCODatasetsZoo.TWITCH.T470152838,
             ROCODatasetsZoo.TWITCH.T470153081,
             ROCODatasetsZoo.TWITCH.T470158483,
             ROCODatasetsZoo.TWITCH.T470152730,
         ],
-        experiment_name="data_augm",
+        experiment_name="test-benchmarker",
     )
 
     random_pipeline = ArmorDigitPipeline.from_pipes([RandomClassifier()], name="random")
 
+    report_dir = _benchmarker.reporter.report_dir
     cnn_pipelines = [
-        make_digits_cnn_pipeline(32, ((32, 32), (64, 64)), reporter.report_dir, with_data_augmentation=True, lr=lr)
-        for lr in (1e-2, 5e-3, 2e-3, 1e-3, 5e-4, 2e-4)
-    ] + [
         make_digits_cnn_pipeline(
-            64, ((32,), (64, 64), (64, 64)), reporter.report_dir, with_data_augmentation=False, lr=lr
+            32, ((32, 32), (64, 64)), report_dir, with_data_augmentation=with_data_augmentation, lr=lr,
         )
-        for lr in (5e-2, 2e-2, 1e-2, 5e-3, 2e-3, 1e-3)
+        for with_data_augmentation in [False]
+        for lr in [2.5e-2, 1.6e-2, 1e-2, 6.3e-3, 4e-4]
     ]
+    # cnn_pipelines = [
+    #     make_digits_cnn_pipeline(
+    #         64, ((32,), (64, 64), (64, 64)), report_dir, with_data_augmentation=with_data_augmentation, lr=lr
+    #     )
+    #     for with_data_augmentation in [True, False]
+    #     for lr in (5.6e-2, 3.1e-2, 1.8e-2, 1e-2, 5.6e-3, 3.1e-3, 1.8e-3, 1e-3)
+    # ]
 
     vgg16_pipelines = [
-        make_vgg16_pipeline(reporter.report_dir, input_size=32, with_data_augmentation=True, lr=lr)
+        make_vgg16_pipeline(report_dir, input_size=32, with_data_augmentation=False, lr=lr)
         for lr in (1e-5, 5e-4, 2e-4, 1e-4, 5e-3)
     ]
 
-    logging.info(f"Run `tensorboard --logdir={reporter.report_dir}` for realtime logs")
+    logging.info(f"Run `tensorboard --logdir={report_dir}` for realtime logs")
 
-    reporter.report([random_pipeline, *cnn_pipelines, *vgg16_pipelines])
+    _benchmarker.benchmark(
+        [random_pipeline,]
+    )
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/clean_datasets.py b/robots-at-robots/research/robots_at_robots/armor_digit/clean_datasets.py
index 943d412fa18d2fc5f17e433920078c948c54e100..394a1c460f442ce985380f5a6fa181d763e9924d 100644
--- a/robots-at-robots/research/robots_at_robots/armor_digit/clean_datasets.py
+++ b/robots-at-robots/research/robots_at_robots/armor_digit/clean_datasets.py
@@ -18,7 +18,18 @@ if __name__ == "__main__":
     _armor_digit_dataset = (
         make_armor_digit_dataset_generator()
         .from_roco_dataset(_roco_dataset)
-        .skip((1009 - 117) + (1000 - 86) + (1000 - 121) + (1000 - 138) + (1000 - 137))
+        .skip(
+            (1009 - 117)
+            + (1000 - 86)
+            + (1000 - 121)
+            + (1000 - 138)
+            + (1000 - 137)
+            + (1000 - 154)
+            + (1000 - 180)
+            + (1000 - 160)
+            + (1000 - 193)
+            + (1000 - 80)
+        )
         .cap(1000)
     )
 
diff --git a/robots-at-robots/research/robots_at_robots/dataset/armor_value_dataset_generator.py b/robots-at-robots/research/robots_at_robots/dataset/armor_value_dataset_generator.py
index 7b4ce98cf3c8aff86cf9a0ce0705b5267801d27c..4aafd34e781d3ff32eac9a774a7da97e8b3fb448 100644
--- a/robots-at-robots/research/robots_at_robots/dataset/armor_value_dataset_generator.py
+++ b/robots-at-robots/research/robots_at_robots/dataset/armor_value_dataset_generator.py
@@ -5,6 +5,7 @@ from polystar.common.filters.exclude_filter import ExcludeFilter
 from polystar.common.filters.filter_abc import FilterABC
 from polystar.common.filters.pass_through_filter import PassThroughFilter
 from research.common.dataset.cleaning.dataset_changes import DatasetChanges
+from research.common.datasets.image_dataset import FileImageDataset
 from research.common.datasets.image_file_dataset_builder import DirectoryDatasetBuilder
 from research.common.datasets.lazy_dataset import TargetT
 from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
@@ -28,12 +29,13 @@ class ArmorValueDatasetGenerator(Generic[TargetT]):
         self.task_name = task_name
         self.targets_filter = targets_filter or PassThroughFilter()
 
-    def from_roco_datasets(self, roco_datasets: List[ROCODatasetBuilder]) -> List[DirectoryDatasetBuilder[TargetT]]:
-        return [self.from_roco_dataset(roco_dataset) for roco_dataset in roco_datasets]
+    # FIXME: inconsistent with from_roco_dataset below, which returns a builder rather than a built dataset
+    def from_roco_datasets(self, roco_datasets: List[ROCODatasetBuilder]) -> List[FileImageDataset[TargetT]]:
+        return [self.from_roco_dataset(roco_dataset).to_file_images().build() for roco_dataset in roco_datasets]
 
     def from_roco_dataset(self, roco_dataset_builder: ROCODatasetBuilder) -> DirectoryDatasetBuilder[TargetT]:
         cache_dir = roco_dataset_builder.main_dir / self.task_name
-        dataset_name = f"{roco_dataset_builder.name}_armor_{self.task_name}"
+        dataset_name = roco_dataset_builder.name
 
         ArmorValueDatasetCache(roco_dataset_builder, cache_dir, dataset_name, self.target_factory).generate_if_needed()
 
diff --git a/robots-at-robots/research/robots_at_robots/demos/demo_pipeline.py b/robots-at-robots/research/robots_at_robots/demos/demo_pipeline.py
index 0212bf8852c52a1f557b2673384dd5621f23072e..c3a4d34ac4b40d71ae7b4214450be2a5137a0be7 100644
--- a/robots-at-robots/research/robots_at_robots/demos/demo_pipeline.py
+++ b/robots-at-robots/research/robots_at_robots/demos/demo_pipeline.py
@@ -16,7 +16,7 @@ from polystar.common.utils.tensorflow import patch_tf_v2
 from polystar.common.view.plt_results_viewer import PltResultViewer
 from polystar.robots_at_robots.dependency_injection import make_injector
 from research.common.datasets.roco.zoo.roco_dataset_zoo import ROCODatasetsZoo
-from research.robots_at_robots.armor_color.baseline_experiments import (
+from research.robots_at_robots.armor_color.benchmark import (
     ArmorColorPipeline,
     MeanChannels,
     RedBlueComparisonClassifier,
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py b/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..045b13dc5da816a3ba4dfde5a42bf20c939b42eb
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py
@@ -0,0 +1,48 @@
+from typing import List
+
+from polystar.common.pipeline.classification.classification_pipeline import ClassificationPipeline
+from research.common.datasets.image_dataset import FileImageDataset
+from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
+from research.robots_at_robots.dataset.armor_value_dataset_generator import ArmorValueDatasetGenerator
+from research.robots_at_robots.evaluation.image_pipeline_evaluation_reporter import ImagePipelineEvaluationReporter
+from research.robots_at_robots.evaluation.image_pipeline_evaluator import ImageClassificationPipelineEvaluator
+from research.robots_at_robots.evaluation.metrics.f1 import F1Metric
+from research.robots_at_robots.evaluation.trainer import ImageClassificationPipelineTrainer
+
+
+class Benchmarker:
+    def __init__(
+        self,
+        train_datasets: List[FileImageDataset],
+        test_datasets: List[FileImageDataset],
+        evaluation_project: str,
+        experiment_name: str,
+        classes: List,
+    ):
+        self.trainer = ImageClassificationPipelineTrainer(train_datasets)
+        self.evaluator = ImageClassificationPipelineEvaluator(train_datasets, test_datasets)
+        self.reporter = ImagePipelineEvaluationReporter(
+            evaluation_project, experiment_name, classes, other_metrics=[F1Metric()]
+        )
+
+    def benchmark(self, pipelines: List[ClassificationPipeline]):
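+        # Train every pipeline on the training datasets, evaluate on both splits, then write the markdown report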
+        self.trainer.train_pipelines(pipelines)
+        self.reporter.report(self.evaluator.evaluate_pipelines(pipelines))
+
+
+def make_armor_value_benchmarker(
+    train_roco_datasets: List[ROCODatasetBuilder],
+    test_roco_datasets: List[ROCODatasetBuilder],
+    evaluation_project: str,
+    experiment_name: str,
+    dataset_generator: ArmorValueDatasetGenerator,
+    classes: List,
+):
+    return Benchmarker(
+        dataset_generator.from_roco_datasets(train_roco_datasets),
+        dataset_generator.from_roco_datasets(test_roco_datasets),
+        evaluation_project=evaluation_project,
+        experiment_name=experiment_name,
+        classes=classes,
+    )
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
index 6cd66e57c0b66c116093d92276dedba7edafa84d..72996a9e2517ff298eca0dda612c3916fe8c55e3 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
@@ -1,10 +1,8 @@
 from collections import Counter
-from dataclasses import dataclass, field
-from enum import Enum
+from dataclasses import InitVar, dataclass, field
 from math import log
 from os.path import relpath
-from pathlib import Path
-from typing import Dict, Generic, Iterable, List, Optional, Tuple
+from typing import Generic, List, Optional, Tuple
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -12,189 +10,235 @@
 from matplotlib.axes import Axes, logging
 from matplotlib.figure import Figure
 from pandas import DataFrame
+from sklearn.metrics import classification_report, confusion_matrix
 
 from polystar.common.pipeline.classification.classification_pipeline import EnumT
-from polystar.common.pipeline.pipeline import Pipeline
 from polystar.common.utils.dataframe import Format, format_df_row, format_df_rows, make_formater
 from polystar.common.utils.markdown import MarkdownFile
 from polystar.common.utils.time import create_time_id
 from research.common.constants import DSET_DIR, EVALUATION_DIR
-from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
-from research.robots_at_robots.evaluation.image_pipeline_evaluator import (
-    ClassificationResults,
-    ImagePipelineEvaluator,
-    SetClassificationResults,
-)
-
-
-class Metric(Enum):
-    F1_WEIGHTED_AVG = ("f1-score", "weighted avg")
-    ACCURACY = ("precision", "accuracy")
-
-    def __str__(self):
-        if self.value[1] == "accuracy":
-            return "accuracy"
-        return " ".join(self.value)
-
-    def __getitem__(self, item):
-        return self.value[item]
+from research.robots_at_robots.evaluation.metrics.accuracy import AccuracyMetric
+from research.robots_at_robots.evaluation.metrics.metric_abc import MetricABC
+from research.robots_at_robots.evaluation.performance import ClassificationPerformance, ClassificationPerformances
+from research.robots_at_robots.evaluation.set import Set
 
 
 @dataclass
 class ImagePipelineEvaluationReporter(Generic[EnumT]):
-    evaluator: ImagePipelineEvaluator[EnumT]
     evaluation_project: str
     experiment_name: str
-    main_metric: Metric = Metric.F1_WEIGHTED_AVG
-    other_metrics: List[Metric] = field(default_factory=lambda: [Metric.ACCURACY])
+    classes: List[EnumT]
+    main_metric: MetricABC = field(default_factory=AccuracyMetric)
+    other_metrics: InitVar[List[MetricABC]] = None
+    _mf: MarkdownFile = field(init=False)
+    _performances: ClassificationPerformances = field(init=False)
 
-    def __post_init__(self):
+    def __post_init__(self, other_metrics: List[MetricABC]):
         self.report_dir = EVALUATION_DIR / self.evaluation_project / f"{create_time_id()}_{self.experiment_name}"
+        self.all_metrics: List[MetricABC] = [self.main_metric] + (other_metrics or [])
 
-    def report(self, pipelines: Iterable[Pipeline]):
-        logging.info(f"Running experiment {self.experiment_name}")
-
-        pipeline2results = self.evaluator.evaluate_pipelines(pipelines)
+    def report(self, performances: ClassificationPerformances):
+        sns.set()
+        self._performances = performances
+        with MarkdownFile(self.report_dir / "report.md") as self._mf:
 
-        with MarkdownFile(self.report_dir / "report.md") as mf:
-            mf.title(f"Evaluation report")
-            self._report_datasets(mf)
-            self._report_aggregated_results(mf, pipeline2results, self.report_dir)
-            self._report_pipelines_results(mf, pipeline2results)
+            self._mf.title(f"Evaluation report")
+            self._report_datasets()
+            self._report_aggregated_results()
+            self._report_pipelines_results()
 
             logging.info(f"Report generated at file:///{self.report_dir/'report.md'}")
 
-    def _report_datasets(self, mf: MarkdownFile):
-        mf.title("Datasets", level=2)
+    def _report_datasets(self):
+        self._mf.title("Datasets", level=2)
 
-        mf.title("Training", level=3)
-        self._report_dataset(
-            mf, self.evaluator.train_roco_datasets, self.evaluator.train_dataset_sizes, self.evaluator.train_labels
-        )
+        self._mf.title("Training", level=3)
+        self._report_dataset(self._performances.train)
 
-        mf.title("Testing", level=3)
-        self._report_dataset(
-            mf, self.evaluator.test_roco_datasets, self.evaluator.test_dataset_sizes, self.evaluator.test_labels
-        )
+        self._mf.title("Testing", level=3)
+        self._report_dataset(self._performances.test)
 
-    @staticmethod
-    def _report_dataset(
-        mf: MarkdownFile, roco_datasets: List[ROCODatasetBuilder], dataset_sizes: List[int], labels: List[EnumT]
-    ):
-        total = len(labels)
-        labels = [str(label) for label in labels]
-        mf.paragraph(f"{total} images")
+    def _report_dataset(self, performances: ClassificationPerformances):
         df = (
-            DataFrame(
-                {
-                    dataset.name: Counter(labels[start:end])
-                    for dataset, start, end in zip(
-                        roco_datasets, np.cumsum([0] + dataset_sizes), np.cumsum(dataset_sizes)
-                    )
-                }
-            )
+            DataFrame({perf.dataset_name: Counter(perf.labels) for perf in performances})
             .fillna(0)
             .sort_index()
+            .astype(int)
         )
-        df["Total"] = sum([df[d.name] for d in roco_datasets])
-        df["Repartition"] = (df["Total"] / total).map("{:.1%}".format)
-        mf.table(df)
-
-    def _report_aggregated_results(
-        self, mf: MarkdownFile, pipeline2results: Dict[str, ClassificationResults[EnumT]], report_dir: Path
-    ):
-        fig_scores, fig_times, aggregated_results = self._aggregate_results(pipeline2results)
-        aggregated_scores_image_name = "aggregated_scores.png"
-        fig_scores.savefig(report_dir / aggregated_scores_image_name)
-        aggregated_times_image_name = "aggregated_times.png"
-        fig_times.savefig(report_dir / aggregated_times_image_name)
-
-        mf.title("Aggregated results", level=2)
-        mf.image(aggregated_scores_image_name)
-        mf.image(aggregated_times_image_name)
-        mf.paragraph("On test set:")
-        mf.table(aggregated_results[aggregated_results["set"] == "test"].drop(columns="set"))
-        mf.paragraph("On train set:")
-        mf.table(aggregated_results[aggregated_results["set"] == "train"].drop(columns="set"))
-
-    def _report_pipelines_results(self, mf: MarkdownFile, pipeline2results: Dict[str, ClassificationResults[EnumT]]):
-        for pipeline_name, results in sorted(
-            pipeline2results.items(),
-            key=lambda name_results: name_results[1].test_results.report[self.main_metric[1]][self.main_metric[0]],
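+        # Add Total/Repartition columns (per label) and rows (per dataset), formatted as percentages, blanking the meaningless corner cells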
+        df["Total"] = df.sum(axis=1)
+        df["Repartition"] = df["Total"] / df["Total"].sum()
+        df.loc["Total"] = df.sum()
+        df.loc["Repartition"] = df.loc["Total"] / df["Total"]["Total"]
+        dset_repartition = df.loc["Repartition"].map("{:.1%}".format)
+        df["Repartition"] = df["Repartition"].map("{:.1%}".format)
+        df.loc["Repartition"] = dset_repartition
+        df.at["Total", "Repartition"] = ""
+        df.at["Repartition", "Repartition"] = ""
+        df.at["Repartition", "Total"] = ""
+        self._mf.table(df)
+
+    def _report_aggregated_results(self):
+        fig_scores, fig_times = self._make_aggregate_figures()
+
+        self._mf.title("Aggregated results", level=2)
+        self._mf.figure(fig_scores, "aggregated_scores.png")
+        self._mf.figure(fig_times, "aggregated_times.png")
+
+        self._mf.paragraph("On test set:")
+        self._mf.table(self._make_aggregated_results_for_set(Set.TRAIN))
+        self._mf.paragraph("On train set:")
+        self._mf.table(self._make_aggregated_results_for_set(Set.TEST))
+
+    def _report_pipelines_results(self):
+        for pipeline_name, performances in sorted(
+            self._performances.group_by_pipeline().items(),
+            key=lambda name_perfs: self.main_metric(name_perfs[1].test.merge()),
             reverse=True,
         ):
-            self._report_pipeline_results(mf, pipeline_name, results)
+            self._report_pipeline_results(pipeline_name, performances)
 
-    def _report_pipeline_results(self, mf: MarkdownFile, pipeline_name: str, results: ClassificationResults[EnumT]):
-        mf.title(pipeline_name, level=2)
+    def _report_pipeline_results(self, pipeline_name: str, performances: ClassificationPerformances):
+        self._mf.title(pipeline_name, level=2)
 
-        mf.paragraph(results.full_pipeline_name)
+        self._mf.title("Train results", level=3)
+        self._report_pipeline_set_results(performances, Set.TRAIN)
 
-        mf.title("Train results", level=3)
-        ImagePipelineEvaluationReporter._report_pipeline_set_results(
-            mf, results.train_results, self.evaluator.train_images_paths
-        )
+        self._mf.title("Test results", level=3)
+        self._report_pipeline_set_results(performances, Set.TEST)
 
-        mf.title("Test results", level=3)
-        ImagePipelineEvaluationReporter._report_pipeline_set_results(
-            mf, results.test_results, self.evaluator.test_images_paths
-        )
+    def _report_pipeline_set_results(self, performances: ClassificationPerformances, set_: Set):
+        performances = performances.on_set(set_)
+        perf = performances.merge()
+
+        self._mf.title("Metrics", level=4)
+        self._report_pipeline_set_metrics(performances, perf, set_)
 
-    @staticmethod
-    def _report_pipeline_set_results(
-        mf: MarkdownFile, results: SetClassificationResults[EnumT], image_paths: List[Path]
+        self._mf.title("Confusion Matrix:", level=4)
+        self._report_pipeline_set_confusion_matrix(perf)
+
+        self._mf.title("25 Mistakes examples", level=4)
+        self._report_pipeline_set_mistakes(perf)
+
+    def _report_pipeline_set_metrics(
+        self, performances: ClassificationPerformances, perf: ClassificationPerformance, set_: Set
     ):
-        mf.title("Metrics", level=4)
-        mf.paragraph(f"Inference time: {results.mean_inference_time: .2e} s/img")
-        df = DataFrame(results.report)
+        fig: Figure = plt.figure(figsize=(9, 6))
+        ax: Axes = fig.subplots()
+        sns.barplot(
+            data=DataFrame(
+                [
+                    {"dataset": performance.dataset_name, "score": metric(performance), "metric": metric.name}
+                    for performance in performances
+                    for metric in self.all_metrics
+                ]
+                + [
+                    {"dataset": performance.dataset_name, "score": len(performance) / len(perf), "metric": "support"}
+                    for performance in performances
+                ]
+            ),
+            x="dataset",
+            hue="metric",
+            y="score",
+            ax=ax,
+        )
+        ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha="right")
+        pipeline_name = performances.performances[0].pipeline_name
+        fig.suptitle(f"{pipeline_name} performance across {set_} datasets")
+        _format_ax(ax, "{:.1%}", limits=(0, 1))
+        fig.tight_layout()
+        self._mf.figure(fig, f"{pipeline_name}_{set_}.png")
+
+        self._mf.paragraph(f"Inference time: {perf.mean_inference_time: .2e} s/img")
+        df = DataFrame(classification_report(perf.labels, perf.predictions, output_dict=True))
         format_df_rows(df, ["precision", "recall", "f1-score"], "{:.1%}")
         format_df_row(df, "support", int)
-        mf.table(df)
-        mf.title("Confusion Matrix:", level=4)
-        mf.table(DataFrame(results.confusion_matrix, index=results.unique_labels, columns=results.unique_labels))
-        mf.title("25 Mistakes examples", level=4)
-        mistakes_idx = np.random.choice(results.mistakes, min(len(results.mistakes), 25), replace=False)
+        self._mf.table(df)
+
+    def _report_pipeline_set_confusion_matrix(self, perf: ClassificationPerformance):
+        self._mf.table(
+            DataFrame(
+                confusion_matrix(perf.labels, perf.predictions), index=perf.unique_labels, columns=perf.unique_labels
+            )
+        )
+
+    def _report_pipeline_set_mistakes(self, perf: ClassificationPerformance):
+        mistakes = perf.mistakes
+        mistakes_idx = np.random.choice(mistakes, min(len(mistakes), 25), replace=False)
         relative_paths = [
-            f"![img]({relpath(str(image_paths[idx]), str(mf.markdown_path.parent))})" for idx in mistakes_idx
+            f"![img]({relpath(str(perf.examples[idx].path), str(self._mf.markdown_path.parent))})"
+            for idx in mistakes_idx
+        ]
+        images_names = [
+            f"[{perf.examples[idx].path.relative_to(DSET_DIR)}]"
+            f"({relpath(str(perf.examples[idx].path), str(self._mf.markdown_path.parent))})"
+            for idx in mistakes_idx
         ]
-        images_names = [image_paths[idx].relative_to(DSET_DIR) for idx in mistakes_idx]
-        mf.table(
+        self._mf.table(
             DataFrame(
                 {
                     "images": relative_paths,
-                    "labels": map(str, results.labels[mistakes_idx]),
-                    "predictions": map(str, results.predictions[mistakes_idx]),
+                    "labels": perf.labels[mistakes_idx],
+                    "predictions": perf.predictions[mistakes_idx],
+                    **{
+                        f"p({str(label)})": map("{:.1%}".format, perf.proba[mistakes_idx, i])
+                        for i, label in enumerate(self.classes)
+                    },
                     "image names": images_names,
                 }
             ).set_index("images")
         )
 
-    def _aggregate_results(
-        self, pipeline2results: Dict[str, ClassificationResults[EnumT]]
-    ) -> Tuple[Figure, Figure, DataFrame]:
-        sns.set_style()
-        sets = ["train", "test"]
+    def _make_aggregate_figures(self) -> Tuple[Figure, Figure]:
         df = DataFrame.from_records(
             [
                 {
-                    "pipeline": pipeline_name,
-                    str(self.main_metric): results.on_set(set_).report[self.main_metric[1]][self.main_metric[0]],
-                    "inference time": results.on_set(set_).mean_inference_time,
-                    "set": set_,
+                    "dataset": perf.dataset_name,
+                    "pipeline": perf.pipeline_name,
+                    self.main_metric.name: self.main_metric(perf),
+                    "time": perf.mean_inference_time,
+                    "set": perf.set_.name.lower(),
+                    "support": len(perf),
                 }
-                for pipeline_name, results in pipeline2results.items()
-                # for metric in [self.main_metric]  # + self.other_metrics
-                for set_ in sets
+                for perf in self._performances
             ]
-        ).sort_values(["set", str(self.main_metric)], ascending=[True, False])
+        ).sort_values(["set", self.main_metric.name], ascending=[True, False])
 
+        df[f"{self.main_metric.name} "] = list(zip(df[self.main_metric.name], df.support))
+        df["time "] = list(zip(df[self.main_metric.name], df.support))
+
+        return (
+            _cat_pipeline_results(df, f"{self.main_metric.name} ", "{:.1%}", limits=(0, 1)),
+            _cat_pipeline_results(df, "time ", "{:.2e}", log_scale=True),
+        )
+
+    def _make_aggregated_results_for_set(self, set_: Set) -> DataFrame:
+        pipeline2performances = self._performances.on_set(set_).group_by_pipeline()
+        pipeline2performance = {
+            pipeline_name: performances.merge() for pipeline_name, performances in pipeline2performances.items()
+        }
         return (
-            _cat_pipeline_results(df, str(self.main_metric), "{:.1%}", limits=(0, 1)),
-            _cat_pipeline_results(df, "inference time", "{:.2e}", log_scale=True),
-            df.set_index("pipeline"),
+            DataFrame(
+                [
+                    {
+                        "pipeline": pipeline_name,
+                        self.main_metric.name: self.main_metric(performance),
+                        "inference time": performance.mean_inference_time,
+                    }
+                    for pipeline_name, performance in pipeline2performance.items()
+                ]
+            )
+            .set_index("pipeline")
+            .sort_values(self.main_metric.name, ascending=False)
         )
 
 
+def weighted_mean(x, **kws):
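+    # Each x element is a (value, support) tuple from the zipped columns above; return the support-weighted average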
+    val, weight = map(np.asarray, zip(*x))
+    return (val * weight).sum() / weight.sum()
+
+
 def _cat_pipeline_results(
     df: DataFrame, y: str, fmt: str, limits: Optional[Tuple[float, float]] = None, log_scale: bool = False
 ) -> Figure:
@@ -208,6 +250,8 @@ def _cat_pipeline_results(
         legend=False,
         col_order=["test", "train"],
         height=10,
+        estimator=weighted_mean,
+        orient="v",
     )
     grid.set_xticklabels(rotation=30, ha="right")
 
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
index 2a2370602aaf2b60f7df81bd872808fedf8a9043..266de795dd14914537cade66ee3f0cd560aaf039 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
@@ -1,111 +1,57 @@
-import logging
-from dataclasses import dataclass
 from enum import Enum
-from pathlib import Path
+from itertools import chain
 from time import time
-from typing import Dict, Generic, Iterable, List, Sequence, Tuple
+from typing import Generic, Iterable, List
 
 import numpy as np
-from memoized_property import memoized_property
-from sklearn.metrics import classification_report, confusion_matrix
-from tqdm import tqdm
 
-from polystar.common.models.image import Image, load_images
-from polystar.common.pipeline.pipeline import Pipeline
+from polystar.common.models.image import file_images_to_images
+from polystar.common.pipeline.classification.classification_pipeline import ClassificationPipeline
+from polystar.common.utils.iterable_utils import flatten
+from research.common.datasets.image_dataset import FileImageDataset
 from research.common.datasets.lazy_dataset import TargetT
-from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
-from research.common.datasets.union_dataset import UnionDataset
-from research.robots_at_robots.dataset.armor_value_dataset_generator import ArmorValueDatasetGenerator
+from research.robots_at_robots.evaluation.performance import (
+    ClassificationPerformance,
+    ClassificationPerformances,
+    ContextualizedClassificationPerformance,
+)
+from research.robots_at_robots.evaluation.set import Set
 
 
-@dataclass
-class SetClassificationResults(Generic[TargetT]):
-    labels: np.ndarray
-    predictions: np.ndarray
-    mean_inference_time: float
-
-    @property
-    def report(self) -> Dict:
-        return classification_report(self.labels, self.predictions, output_dict=True)
-
-    @property
-    def confusion_matrix(self) -> Dict:
-        return confusion_matrix(self.labels, self.predictions)
-
-    @property
-    def mistakes(self) -> Sequence[int]:
-        return np.where(self.labels != self.predictions)[0]
-
-    @memoized_property
-    def unique_labels(self) -> List[TargetT]:
-        return sorted(set(self.labels) | set(self.predictions))
-
-
-@dataclass
-class ClassificationResults(Generic[TargetT]):
-    train_results: SetClassificationResults[TargetT]
-    test_results: SetClassificationResults[TargetT]
-    full_pipeline_name: str
-
-    def on_set(self, set_: str) -> SetClassificationResults[TargetT]:
-        if set_ is "train":
-            return self.train_results
-        return self.test_results
-
-
-class ImagePipelineEvaluator(Generic[TargetT]):
+class ImageClassificationPipelineEvaluator(Generic[TargetT]):
     def __init__(
-        self,
-        train_roco_datasets: List[ROCODatasetBuilder],
-        test_roco_datasets: List[ROCODatasetBuilder],
-        image_dataset_generator: ArmorValueDatasetGenerator[TargetT],
+        self, train_datasets: List[FileImageDataset], test_datasets: List[FileImageDataset],
     ):
-        logging.info("Loading data")
-        self.train_roco_datasets = train_roco_datasets
-        self.test_roco_datasets = test_roco_datasets
-        (self.train_images_paths, self.train_images, self.train_labels, self.train_dataset_sizes) = load_datasets(
-            train_roco_datasets, image_dataset_generator
-        )
-        (self.test_images_paths, self.test_images, self.test_labels, self.test_dataset_sizes) = load_datasets(
-            test_roco_datasets, image_dataset_generator
-        )
-
-    def evaluate_pipelines(self, pipelines: Iterable[Pipeline]) -> Dict[str, ClassificationResults]:
-        tqdm_pipelines = tqdm(pipelines, desc="Training", unit="pipeline")
-        return {str(pipeline): self.evaluate_pipeline(pipeline, tqdm_pipelines) for pipeline in tqdm_pipelines}
+        self.train_datasets = train_datasets
+        self.test_datasets = test_datasets
 
-    def evaluate_pipeline(self, pipeline: Pipeline, tqdm_pipelines: tqdm) -> ClassificationResults:
-        tqdm_pipelines.set_postfix({"pipeline": pipeline.name}, True)
-        pipeline.fit(self.train_images, self.train_labels)
+    def evaluate_pipelines(self, pipelines: Iterable[ClassificationPipeline]) -> ClassificationPerformances:
+        return ClassificationPerformances(flatten(self._evaluate_pipeline(pipeline) for pipeline in pipelines))
 
-        train_results = self._evaluate_pipeline_on_set(pipeline, self.train_images, self.train_labels)
-        test_results = self._evaluate_pipeline_on_set(pipeline, self.test_images, self.test_labels)
-
-        return ClassificationResults(
-            train_results=train_results, test_results=test_results, full_pipeline_name=repr(pipeline),
+    def _evaluate_pipeline(self, pipeline: ClassificationPipeline) -> Iterable[ContextualizedClassificationPerformance]:
+        return chain(
+            self._evaluate_pipeline_on_set(pipeline, self.train_datasets, Set.TRAIN),
+            self._evaluate_pipeline_on_set(pipeline, self.test_datasets, Set.TEST),
         )
 
     @staticmethod
     def _evaluate_pipeline_on_set(
-        pipeline: Pipeline, images: List[Image], labels: List[TargetT]
-    ) -> SetClassificationResults:
-        t = time()
-        preds = pipeline.predict(images)
-        mean_time = (time() - t) / len(images)
-        return SetClassificationResults(_labels_to_numpy(labels), _labels_to_numpy(preds), mean_time)
-
-
-def load_datasets(
-    roco_datasets: List[ROCODatasetBuilder], image_dataset_generator: ArmorValueDatasetGenerator[TargetT],
-) -> Tuple[List[Path], List[Image], List[TargetT], List[int]]:
-    # TODO we should receive a list of FileImageDataset
-    datasets = [builder.build() for builder in image_dataset_generator.from_roco_datasets(roco_datasets)]
-    dataset_sizes = [len(d) for d in datasets]
-
-    dataset = UnionDataset(datasets)
-    paths, targets = list(dataset.examples), list(dataset.targets)
-    images = list(load_images(paths))
-    return paths, images, targets, dataset_sizes
+        pipeline: ClassificationPipeline, datasets: List[FileImageDataset], set_: Set
+    ) -> Iterable[ContextualizedClassificationPerformance]:
+        for dataset in datasets:
+            t = time()
+            proba, classes = pipeline.predict_proba_and_classes(file_images_to_images(dataset.examples))
+            mean_time = (time() - t) / len(dataset)
+            yield ContextualizedClassificationPerformance(
+                examples=dataset.examples,
+                labels=_labels_to_numpy(dataset.targets),
+                predictions=_labels_to_numpy(classes),
+                proba=proba,
+                mean_inference_time=mean_time,
+                set_=set_,
+                dataset_name=dataset.name,
+                pipeline_name=pipeline.name,
+            )
 
 
 def _labels_to_numpy(labels: List[Enum]) -> np.ndarray:
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/metrics/__init__.py b/robots-at-robots/research/robots_at_robots/evaluation/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/metrics/accuracy.py b/robots-at-robots/research/robots_at_robots/evaluation/metrics/accuracy.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccfe9c73bdda26c7c0624fa3220d30e335a76506
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/metrics/accuracy.py
@@ -0,0 +1,11 @@
+from research.robots_at_robots.evaluation.metrics.metric_abc import MetricABC
+from research.robots_at_robots.evaluation.performance import ClassificationPerformance
+
+
+class AccuracyMetric(MetricABC):
+    def __call__(self, performance: ClassificationPerformance) -> float:
+        return (performance.labels == performance.predictions).mean()
+
+    @property
+    def name(self) -> str:
+        return "accuracy"
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/metrics/f1.py b/robots-at-robots/research/robots_at_robots/evaluation/metrics/f1.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd5f48ae0202e7b917f58e7d6f94b1713de2caff
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/metrics/f1.py
@@ -0,0 +1,31 @@
+from enum import Enum, auto
+
+from sklearn.metrics import f1_score
+
+from research.robots_at_robots.evaluation.metrics.metric_abc import MetricABC
+from research.robots_at_robots.evaluation.performance import ClassificationPerformance
+
+
+class F1Strategy(Enum):
+    MICRO = auto()
+    MACRO = auto()
+    SAMPLES = auto()
+    WEIGHTED = auto()
+
+    def __repr__(self):
+        return self.name.lower()
+
+    __str__ = __repr__
+
+
+class F1Metric(MetricABC):
+    def __init__(self, strategy: F1Strategy = F1Strategy.MACRO):
+        self.strategy = strategy
+
+    def __call__(self, performance: ClassificationPerformance) -> float:
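+        # str(self.strategy) is the lowercase strategy name expected by sklearn's average parameter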
+        return f1_score(performance.labels, performance.predictions, average=str(self.strategy))
+
+    @property
+    def name(self) -> str:
+        return f"f1 {self.strategy}"
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/metrics/metric_abc.py b/robots-at-robots/research/robots_at_robots/evaluation/metrics/metric_abc.py
new file mode 100644
index 0000000000000000000000000000000000000000..f25a0c3f122a311d3495e74a5d02a3d9eff224e2
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/metrics/metric_abc.py
@@ -0,0 +1,17 @@
+from abc import ABC, abstractmethod
+
+from research.robots_at_robots.evaluation.performance import ClassificationPerformance
+
+
+class MetricABC(ABC):
+    @abstractmethod
+    def __call__(self, performance: ClassificationPerformance) -> float:
+        pass
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        pass
+
+    def __repr__(self):
+        return self.name
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/performance.py b/robots-at-robots/research/robots_at_robots/evaluation/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..33c0bc765a301b1bbe3c956948fa3351052cce6b
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/performance.py
@@ -0,0 +1,80 @@
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Sequence
+
+import numpy as np
+from memoized_property import memoized_property
+
+from polystar.common.filters.filter_abc import FilterABC
+from polystar.common.models.image import FileImage
+from polystar.common.utils.iterable_utils import flatten, group_by
+from research.robots_at_robots.evaluation.set import Set
+
+
+@dataclass
+class ClassificationPerformance:
+    examples: List[FileImage]
+    labels: np.ndarray
+    predictions: np.ndarray
+    proba: np.ndarray
+    mean_inference_time: float
+
+    @property
+    def mistakes(self) -> Sequence[int]:
+        return np.where(self.labels != self.predictions)[0]
+
+    @memoized_property
+    def unique_labels(self):
+        return sorted(set(self.labels) | set(self.predictions))
+
+    def __len__(self) -> int:
+        return len(self.labels)
+
+
+@dataclass
+class ContextualizedClassificationPerformance(ClassificationPerformance):
+    set_: Set
+    dataset_name: str
+    pipeline_name: str
+
+
+@dataclass
+class ClassificationPerformances(Iterable[ContextualizedClassificationPerformance]):
+    performances: List[ContextualizedClassificationPerformance]
+
+    @property
+    def train(self) -> "ClassificationPerformances":
+        return self.on_set(Set.TRAIN)
+
+    @property
+    def test(self) -> "ClassificationPerformances":
+        return self.on_set(Set.TEST)
+
+    def on_set(self, set_: Set) -> "ClassificationPerformances":
+        return ClassificationPerformances(SetClassificationPerformanceFilter(set_).filter(self.performances))
+
+    def group_by_pipeline(self) -> Dict[str, "ClassificationPerformances"]:
+        return {
+            name: ClassificationPerformances(performances)
+            for name, performances in group_by(self, lambda p: p.pipeline_name).items()
+        }
+
+    def merge(self) -> ClassificationPerformance:
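+        # Concatenate the per-dataset performances; the merged inference time is weighted by dataset size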
+        return ClassificationPerformance(
+            examples=flatten(p.examples for p in self),
+            labels=np.concatenate([p.labels for p in self]),
+            predictions=np.concatenate([p.predictions for p in self]),
+            proba=np.concatenate([p.proba for p in self]),
+            mean_inference_time=np.average([p.mean_inference_time for p in self], weights=[len(p) for p in self]),
+        )
+
+    def __iter__(self):
+        return iter(self.performances)
+
+
+@dataclass
+class SetClassificationPerformanceFilter(FilterABC[ContextualizedClassificationPerformance]):
+    set_: Set
+
+    def validate_single(self, perf: ContextualizedClassificationPerformance) -> bool:
+        return perf.set_ is self.set_
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/set.py b/robots-at-robots/research/robots_at_robots/evaluation/set.py
new file mode 100644
index 0000000000000000000000000000000000000000..6175a68587d575e3d18cacf456ab99da45925220
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/set.py
@@ -0,0 +1,12 @@
+from enum import Enum, auto
+
+
+class Set(Enum):
+    TRAIN = auto()
+    VALIDATION = auto()
+    TEST = auto()
+
+    def __repr__(self):
+        return self.name.lower()
+
+    __str__ = __repr__
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/trainer.py b/robots-at-robots/research/robots_at_robots/evaluation/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6731cd00fd39b312d7e878eecc55fbb4d85adfb6
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/trainer.py
@@ -0,0 +1,26 @@
+from typing import Generic, List
+
+from tqdm import tqdm
+
+from polystar.common.models.image import file_images_to_images
+from polystar.common.pipeline.classification.classification_pipeline import ClassificationPipeline
+from research.common.datasets.image_dataset import FileImageDataset
+from research.common.datasets.lazy_dataset import TargetT
+from research.common.datasets.union_dataset import UnionDataset
+
+
+class ImageClassificationPipelineTrainer(Generic[TargetT]):
+    def __init__(self, training_datasets: List[FileImageDataset]):
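+        # Pool all training datasets into a single set of images and labels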
+        train_dataset = UnionDataset(training_datasets)
+        self.images = file_images_to_images(train_dataset.examples)
+        self.labels = train_dataset.targets
+
+    def train_pipeline(self, pipeline: ClassificationPipeline):
+        pipeline.fit(self.images, self.labels)
+
+    def train_pipelines(self, pipelines: List[ClassificationPipeline]):
+        tqdm_pipelines = tqdm(pipelines, desc="Training Pipelines")
+        for pipeline in tqdm_pipelines:
+            tqdm_pipelines.set_postfix({"pipeline": pipeline.name}, True)
+            self.train_pipeline(pipeline)