diff --git a/common/polystar/common/pipeline/classification/classification_pipeline.py b/common/polystar/common/pipeline/classification/classification_pipeline.py
index 99c85f5000881185af3362c2856a856f06fcee71..56086c97eaa11a2d85369f04b1e2920fcda610dc 100644
--- a/common/polystar/common/pipeline/classification/classification_pipeline.py
+++ b/common/polystar/common/pipeline/classification/classification_pipeline.py
@@ -22,7 +22,16 @@ class ClassificationPipeline(Pipeline, Generic[IT, EnumT], ABC):
     def classifier(self) -> ClassifierABC:
         return self.steps[-1][-1]
 
-    def fit(self, x: Sequence[IT], y: List[EnumT], **fit_params):
+    def fit(self, x: Sequence[IT], y: List[EnumT], validation_size: int = 0, **fit_params):
+        """Fit the pipeline on `x`, after converting the labels `y` to indices.
+
+        :param validation_size: forwarded to the final classifier's `fit` as a fit parameter.
+        :param fit_params: extra `<step name>__<parameter>` fit parameters, passed to the parent `fit`.
+        """
+        if isinstance(self.classifier, ClassifierABC):
+            # Fit parameters are routed by step name, which is not guaranteed to be the class name.
+            classifier_step_name = self.steps[-1][0]
+            fit_params[f"{classifier_step_name}__validation_size"] = validation_size
         y_indices = _labels_to_indices(y)
         return super().fit(x, y_indices, **fit_params)
 
diff --git a/common/polystar/common/pipeline/classification/classifier_abc.py b/common/polystar/common/pipeline/classification/classifier_abc.py
index 64c89b96e323ee85e9309fd2ed757ac0d418de21..3016baf348e6d388ec4d5e608e893857cc75cbb7 100644
--- a/common/polystar/common/pipeline/classification/classifier_abc.py
+++ b/common/polystar/common/pipeline/classification/classifier_abc.py
@@ -10,7 +10,12 @@ from polystar.common.utils.named_mixin import NamedMixin
 class ClassifierABC(BaseEstimator, NamedMixin, Generic[IT], ABC):
     n_classes: int
 
-    def fit(self, examples: List[IT], label_indices: List[int]) -> "ClassifierABC":
+    def fit(self, examples: List[IT], label_indices: List[int], validation_size: int = 0) -> "ClassifierABC":
+        """Default fit that does nothing and returns the classifier itself.
+
+        `validation_size` defaults to 0 so that callers using the previous two-argument
+        signature keep working.
+        """
         return self
 
     @abstractmethod
diff --git a/common/polystar/common/pipeline/classification/random_model.py b/common/polystar/common/pipeline/classification/random_model.py
index d6a13a9520b3e841b6912446d8305784d13ae789..9080f7afa45fd11b96af805348de02d6896e8257 100644
--- a/common/polystar/common/pipeline/classification/random_model.py
+++ b/common/polystar/common/pipeline/classification/random_model.py
@@ -11,7 +11,8 @@ class RandomClassifier(RuleBasedClassifierABC):
     def predict(self, examples: np.ndarray) -> List[int]:
         return choice(range(self.n_classes), size=len(examples), replace=True, p=self.weights_)
 
-    def fit(self, examples: List, label_indices: List[int]) -> "RandomClassifier":
+    def fit(self, examples: List, label_indices: List[int], validation_size: int = 0) -> "RandomClassifier":
+        """Estimate the class frequencies from `label_indices`; `examples` and `validation_size` are not used."""
         indices2counts = Counter(label_indices)
         self.weights_ = [indices2counts[i] / len(label_indices) for i in range(self.n_classes)]
         return self
diff --git a/common/polystar/common/utils/markdown.py b/common/polystar/common/utils/markdown.py
index 3997375130872f443774927e0585f9e2d76fc552..1aa5e5f24ce51ac7d3233def6f4c4412985d2248 100644
--- a/common/polystar/common/utils/markdown.py
+++ b/common/polystar/common/utils/markdown.py
@@ -33,7 +33,7 @@ class MarkdownFile:
         return self
 
     def image(self, relative_path: str, alt: str = "img") -> "MarkdownFile":
-        self.paragraph(f"![{alt}]({relative_path})")
+        self.paragraph(f"![{alt}]({str(relative_path).replace(' ', '%20')})")
         return self
 
     def figure(self, figure: Figure, name: str, alt: str = "img"):
diff --git a/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py b/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py
index a01bf0de068690b9dfb1582a668874aa95739365..37a9e35e4a2601bc651b4cb7665f0aea43be73f8 100644
--- a/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py
+++ b/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py
@@ -7,11 +7,15 @@ from research.robots_at_robots.evaluation.benchmark import make_armor_value_benc
 
 
 def make_armor_color_benchmarker(
-    train_roco_datasets: List[ROCODatasetBuilder], test_roco_datasets: List[ROCODatasetBuilder], experiment_name: str
+    train_roco_datasets: List[ROCODatasetBuilder],
+    validation_roco_datasets: List[ROCODatasetBuilder],
+    test_roco_datasets: List[ROCODatasetBuilder],
+    experiment_name: str,
 ):
     dataset_generator = make_armor_color_dataset_generator()
     return make_armor_value_benchmarker(
         train_roco_datasets=train_roco_datasets,
+        validation_roco_datasets=validation_roco_datasets,
         test_roco_datasets=test_roco_datasets,
         evaluation_project="armor-color",
         experiment_name=experiment_name,
diff --git a/robots-at-robots/research/robots_at_robots/armor_color/benchmark.py b/robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
index 1ac6f2b9ce966d91659b005f6206fa35f80c7647..441fb0d709354c2190877c8b3d5a293071b98aad 100644
--- a/robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
+++ b/robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
@@ -37,19 +37,20 @@ if __name__ == "__main__":
     logging.getLogger().setLevel("INFO")
 
     _benchmarker = make_armor_color_benchmarker(
-        [
+        train_roco_datasets=[
             ROCODatasetsZoo.TWITCH.T470150052,
             ROCODatasetsZoo.TWITCH.T470152289,
             ROCODatasetsZoo.TWITCH.T470149568,
             ROCODatasetsZoo.TWITCH.T470151286,
         ],
-        [
+        validation_roco_datasets=[],
+        test_roco_datasets=[
             ROCODatasetsZoo.TWITCH.T470152838,
             ROCODatasetsZoo.TWITCH.T470153081,
             ROCODatasetsZoo.TWITCH.T470158483,
             ROCODatasetsZoo.TWITCH.T470152730,
         ],
-        "test",
+        experiment_name="test",
     )
 
     red_blue_comparison_pipeline = ArmorColorPipeline.from_pipes(
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py b/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py
index f4792c43adcec2192b74a2457662cae09d028681..d55d54a4202a2b3cc771a6dc6b64d972c25668a4 100644
--- a/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py
+++ b/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py
@@ -7,11 +7,15 @@ from research.robots_at_robots.evaluation.benchmark import make_armor_value_benc
 
 
 def make_armor_digit_benchmarker(
-    train_roco_datasets: List[ROCODatasetBuilder], test_roco_datasets: List[ROCODatasetBuilder], experiment_name: str
+    train_roco_datasets: List[ROCODatasetBuilder],
+    validation_roco_datasets: List[ROCODatasetBuilder],
+    test_roco_datasets: List[ROCODatasetBuilder],
+    experiment_name: str,
 ):
     dataset_generator = make_armor_digit_dataset_generator()
     return make_armor_value_benchmarker(
         train_roco_datasets=train_roco_datasets,
+        validation_roco_datasets=validation_roco_datasets,
         test_roco_datasets=test_roco_datasets,
         evaluation_project="armor-digit",
         experiment_name=experiment_name,
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py b/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
index 1b48d0ea6c6c17c276ad28d3e1687a2446f42a51..1c7a80f81c2449e352fc7795b6e09b6053f9448e 100644
--- a/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
+++ b/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
@@ -29,7 +29,8 @@ class ArmorDigitPipeline(ClassificationPipeline):
 
 
 class KerasClassifier(ClassifierABC):
-    def __init__(self, model: Model, optimizer, logs_dir: Path, with_data_augmentation: bool):
+    def __init__(self, model: Model, optimizer, logs_dir: Path, with_data_augmentation: bool, batch_size: int = 32):
+        self.batch_size = batch_size
         self.logs_dir = logs_dir
         self.with_data_augmentation = with_data_augmentation
         self.model = model
@@ -41,19 +42,17 @@ class KerasClassifier(ClassifierABC):
             return ImageDataGenerator()
         return ImageDataGenerator(rotation_range=45, zoom_range=[0.8, 1])  # brightness_range=[0.7, 1.4]
 
-    def fit(self, images: List[Image], labels: List[int]) -> "KerasClassifier":
-        n_val: int = 371  # FIXME
+    def fit(self, images: List[Image], labels: List[int], validation_size: int) -> "KerasClassifier":
         images = asarray(images)
         labels = to_categorical(asarray(labels), 5)  # FIXME
-        train_images, train_labels = images[:-n_val], labels[:-n_val]
-        val_images, val_labels = images[-n_val:], labels[-n_val:]
+        n_train = max(len(images) - validation_size, 0)  # not `[:-validation_size]`: `[:-0]` is empty
+        (train_images, val_images), (train_labels, val_labels) = ((a[:n_train], a[n_train:]) for a in (images, labels))
 
-        batch_size = 32  # FIXME
-        train_generator = self.train_data_gen.flow(train_images, train_labels, batch_size=batch_size, shuffle=True)
+        train_generator = self.train_data_gen.flow(train_images, train_labels, batch_size=self.batch_size, shuffle=True)
 
         self.model.fit(
             x=train_generator,
-            steps_per_epoch=len(train_images) / batch_size,
+            steps_per_epoch=len(train_images) / self.batch_size,
             validation_data=(val_images, val_labels),
             epochs=300,
             callbacks=[
@@ -102,7 +101,7 @@ def make_digits_cnn_pipeline(
 ) -> ArmorDigitPipeline:
     name = (
         f"cnn - ({input_size}) - lr {lr} - "
-        + " / ".join("_".join(map(str, sizes)) for sizes in conv_blocks)
+        + " ".join("_".join(map(str, sizes)) for sizes in conv_blocks)
         + (" - with_data_augm" * with_data_augmentation)
     )
     input_size = (input_size, input_size)
@@ -179,8 +178,8 @@ if __name__ == "__main__":
             ROCODatasetsZoo.TWITCH.T470150052,
             ROCODatasetsZoo.TWITCH.T470149568,
             ROCODatasetsZoo.TWITCH.T470151286,
-            ROCODatasetsZoo.TWITCH.T470152289,
         ],
+        validation_roco_datasets=[ROCODatasetsZoo.TWITCH.T470152289],
         test_roco_datasets=[
             ROCODatasetsZoo.TWITCH.T470152838,
             ROCODatasetsZoo.TWITCH.T470153081,
@@ -190,12 +189,12 @@ if __name__ == "__main__":
         experiment_name="test-benchmarker",
     )
 
-    random_pipeline = ArmorDigitPipeline.from_pipes([RandomClassifier()], name="random")
+    _report_dir = _benchmarker.reporter.report_dir
 
-    report_dir = _benchmarker.reporter.report_dir
-    cnn_pipelines = [
+    _random_pipeline = ArmorDigitPipeline.from_pipes([RandomClassifier()], name="random")
+    _cnn_pipelines = [
         make_digits_cnn_pipeline(
-            32, ((32, 32), (64, 64)), report_dir, with_data_augmentation=with_data_augmentation, lr=lr,
+            32, ((32, 32), (64, 64)), _report_dir, with_data_augmentation=with_data_augmentation, lr=lr,
         )
         for with_data_augmentation in [False]
         for lr in [2.5e-2, 1.6e-2, 1e-2, 6.3e-3, 4e-4]
@@ -209,12 +208,10 @@ if __name__ == "__main__":
     # ]
 
     vgg16_pipelines = [
-        make_vgg16_pipeline(report_dir, input_size=32, with_data_augmentation=False, lr=lr)
+        make_vgg16_pipeline(_report_dir, input_size=32, with_data_augmentation=False, lr=lr)
         for lr in (1e-5, 5e-4, 2e-4, 1e-4, 5e-3)
     ]
 
-    logging.info(f"Run `tensorboard --logdir={report_dir}` for realtime logs")
+    logging.info(f"Run `tensorboard --logdir={_report_dir}` for realtime logs")
 
-    _benchmarker.benchmark(
-        [random_pipeline,]
-    )
+    _benchmarker.benchmark([_random_pipeline] + _cnn_pipelines)
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py b/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py
index 045b13dc5da816a3ba4dfde5a42bf20c939b42eb..b9cb996993c64e9dde6afe989cbdcc312148ef3c 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py
@@ -16,13 +16,14 @@ class Benchmarker:
     def __init__(
         self,
         train_datasets: List[FileImageDataset],
+        validation_datasets: List[FileImageDataset],
         test_datasets: List[FileImageDataset],
         evaluation_project: str,
         experiment_name: str,
         classes: List,
     ):
-        self.trainer = ImageClassificationPipelineTrainer(train_datasets)
-        self.evaluator = ImageClassificationPipelineEvaluator(train_datasets, test_datasets)
+        self.trainer = ImageClassificationPipelineTrainer(train_datasets, validation_datasets)
+        self.evaluator = ImageClassificationPipelineEvaluator(train_datasets, validation_datasets, test_datasets)
         self.reporter = ImagePipelineEvaluationReporter(
             evaluation_project, experiment_name, classes, other_metrics=[F1Metric()]
         )
@@ -34,6 +35,7 @@ class Benchmarker:
 
 def make_armor_value_benchmarker(
     train_roco_datasets: List[ROCODatasetBuilder],
+    validation_roco_datasets: List[ROCODatasetBuilder],
     test_roco_datasets: List[ROCODatasetBuilder],
     evaluation_project: str,
     experiment_name: str,
@@ -41,8 +43,9 @@ def make_armor_value_benchmarker(
     classes: List,
 ):
     return Benchmarker(
-        dataset_generator.from_roco_datasets(train_roco_datasets),
-        dataset_generator.from_roco_datasets(test_roco_datasets),
+        train_datasets=dataset_generator.from_roco_datasets(train_roco_datasets),
+        validation_datasets=dataset_generator.from_roco_datasets(validation_roco_datasets),
+        test_datasets=dataset_generator.from_roco_datasets(test_roco_datasets),
         evaluation_project=evaluation_project,
         experiment_name=experiment_name,
         classes=classes,
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
index 72996a9e2517ff298eca0dda612c3916fe8c55e3..bc36ef8a4e16f206d4b9b64c9eaaa7b6ad4463a1 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
@@ -52,8 +52,11 @@ class ImagePipelineEvaluationReporter(Generic[EnumT]):
     def _report_datasets(self):
         self._mf.title("Datasets", level=2)
 
-        self._mf.title("Training", level=3)
+        self._mf.title("Train-val", level=3)
+        self._mf.paragraph("Train")
         self._report_dataset(self._performances.train)
+        self._mf.paragraph("Val")
+        self._report_dataset(self._performances.validation)
 
         self._mf.title("Testing", level=3)
         self._report_dataset(self._performances.test)
@@ -85,9 +88,11 @@ class ImagePipelineEvaluationReporter(Generic[EnumT]):
         self._mf.figure(fig_times, "aggregated_times.png")
 
         self._mf.paragraph("On test set:")
-        self._mf.table(self._make_aggregated_results_for_set(Set.TRAIN))
-        self._mf.paragraph("On train set:")
         self._mf.table(self._make_aggregated_results_for_set(Set.TEST))
+        self._mf.paragraph("On validation set:")
+        self._mf.table(self._make_aggregated_results_for_set(Set.VALIDATION))
+        self._mf.paragraph("On train set:")
+        self._mf.table(self._make_aggregated_results_for_set(Set.TRAIN))
 
     def _report_pipelines_results(self):
         for pipeline_name, performances in sorted(
@@ -100,12 +105,15 @@ class ImagePipelineEvaluationReporter(Generic[EnumT]):
     def _report_pipeline_results(self, pipeline_name: str, performances: ClassificationPerformances):
         self._mf.title(pipeline_name, level=2)
 
-        self._mf.title("Train results", level=3)
-        self._report_pipeline_set_results(performances, Set.TRAIN)
-
         self._mf.title("Test results", level=3)
         self._report_pipeline_set_results(performances, Set.TEST)
 
+        self._mf.title("Validation results", level=3)
+        self._report_pipeline_set_results(performances, Set.VALIDATION)
+
+        self._mf.title("Train results", level=3)
+        self._report_pipeline_set_results(performances, Set.TRAIN)
+
     def _report_pipeline_set_results(self, performances: ClassificationPerformances, set_: Set):
         performances = performances.on_set(set_)
         perf = performances.merge()
@@ -204,7 +212,7 @@ class ImagePipelineEvaluationReporter(Generic[EnumT]):
         ).sort_values(["set", self.main_metric.name], ascending=[True, False])
 
         df[f"{self.main_metric.name} "] = list(zip(df[self.main_metric.name], df.support))
-        df["time "] = list(zip(df[self.main_metric.name], df.support))
+        df["time "] = list(zip(df.time, df.support))
 
         return (
             _cat_pipeline_results(df, f"{self.main_metric.name} ", "{:.1%}", limits=(0, 1)),
@@ -248,20 +256,19 @@ def _cat_pipeline_results(
         kind="bar",
         sharey=True,
         legend=False,
-        col_order=["test", "train"],
-        height=10,
+        col_order=["test", "validation", "train"],
+        height=8,
         estimator=weighted_mean,
         orient="v",
     )
-    grid.set_xticklabels(rotation=30, ha="right")
 
     fig: Figure = grid.fig
 
+    grid.set_xticklabels(rotation=30, ha="right")
     _format_axes(fig.get_axes(), fmt, limits=limits, log_scale=log_scale)
 
-    fig.tight_layout()
-
     fig.suptitle(y)
+    fig.tight_layout()
 
     return fig
 
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
index 266de795dd14914537cade66ee3f0cd560aaf039..9f11ae38cb1071e8868ee9c13d38f5a3ec795782 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
@@ -1,5 +1,4 @@
 from enum import Enum
-from itertools import chain
 from time import time
 from typing import Generic, Iterable, List
 
@@ -20,25 +19,24 @@ from research.robots_at_robots.evaluation.set import Set
 
 class ImageClassificationPipelineEvaluator(Generic[TargetT]):
     def __init__(
-        self, train_datasets: List[FileImageDataset], test_datasets: List[FileImageDataset],
+        self,
+        train_datasets: List[FileImageDataset],
+        validation_datasets: List[FileImageDataset],
+        test_datasets: List[FileImageDataset],
     ):
-        self.train_datasets = train_datasets
-        self.test_datasets = test_datasets
+        self.set2datasets = {Set.TRAIN: train_datasets, Set.VALIDATION: validation_datasets, Set.TEST: test_datasets}
 
     def evaluate_pipelines(self, pipelines: Iterable[ClassificationPipeline]) -> ClassificationPerformances:
         return ClassificationPerformances(flatten(self._evaluate_pipeline(pipeline) for pipeline in pipelines))
 
     def _evaluate_pipeline(self, pipeline: ClassificationPipeline) -> Iterable[ContextualizedClassificationPerformance]:
-        return chain(
-            self._evaluate_pipeline_on_set(pipeline, self.train_datasets, Set.TRAIN),
-            self._evaluate_pipeline_on_set(pipeline, self.test_datasets, Set.TEST),
-        )
+        for set_ in Set:
+            yield from self._evaluate_pipeline_on_set(pipeline, set_)
 
-    @staticmethod
     def _evaluate_pipeline_on_set(
-        pipeline: ClassificationPipeline, datasets: List[FileImageDataset], set_: Set
+        self, pipeline: ClassificationPipeline, set_: Set
     ) -> Iterable[ContextualizedClassificationPerformance]:
-        for dataset in datasets:
+        for dataset in self.set2datasets[set_]:
             t = time()
             proba, classes = pipeline.predict_proba_and_classes(file_images_to_images(dataset.examples))
             mean_time = (time() - t) / len(dataset)
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/performance.py b/robots-at-robots/research/robots_at_robots/evaluation/performance.py
index 33c0bc765a301b1bbe3c956948fa3351052cce6b..52c014c9b6348a7ddf6a556843806c2f09b908cb 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/performance.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/performance.py
@@ -49,6 +49,10 @@ class ClassificationPerformances(Iterable[ContextualizedClassificationPerformanc
     def test(self) -> "ClassificationPerformances":
         return self.on_set(Set.TEST)
 
+    @property
+    def validation(self) -> "ClassificationPerformances":
+        return self.on_set(Set.VALIDATION)
+
     def on_set(self, set_: Set) -> "ClassificationPerformances":
         return ClassificationPerformances(SetClassificationPerformanceFilter(set_).filter(self.performances))
 
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/set.py b/robots-at-robots/research/robots_at_robots/evaluation/set.py
index 6175a68587d575e3d18cacf456ab99da45925220..0911a53bc040bf721a0cb9afa64235d2d416a3c8 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/set.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/set.py
@@ -8,6 +8,9 @@ class Set(Enum):
     VALIDATION = auto()
     TEST = auto()
 
+    def __hash__(self):
+        return hash(self.name)
+
     def __repr__(self):
         return self.name.lower()
 
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/trainer.py b/robots-at-robots/research/robots_at_robots/evaluation/trainer.py
index 6731cd00fd39b312d7e878eecc55fbb4d85adfb6..6f5940bb0ab56abc2ce15c8b81f932e30971d1f2 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/trainer.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/trainer.py
@@ -10,13 +10,14 @@ from research.common.datasets.union_dataset import UnionDataset
 
 
 class ImageClassificationPipelineTrainer(Generic[TargetT]):
-    def __init__(self, training_datasets: List[FileImageDataset]):
-        train_dataset = UnionDataset(training_datasets)
-        self.images = file_images_to_images(train_dataset.examples)
-        self.labels = train_dataset.targets
+    def __init__(self, training_datasets: List[FileImageDataset], validation_datasets: List[FileImageDataset]):
+        dataset = UnionDataset(training_datasets + validation_datasets)
+        self.validation_size = sum(len(d) for d in validation_datasets)
+        self.images = file_images_to_images(dataset.examples)
+        self.labels = dataset.targets
 
     def train_pipeline(self, pipeline: ClassificationPipeline):
-        pipeline.fit(self.images, self.labels)
+        pipeline.fit(self.images, self.labels, validation_size=self.validation_size)
 
     def train_pipelines(self, pipelines: List[ClassificationPipeline]):
         tqdm_pipelines = tqdm(pipelines, desc="Training Pipelines")