diff --git a/common/research_common/image_pipeline_evaluation/image_dataset_generator.py b/common/research_common/image_pipeline_evaluation/image_dataset_generator.py
index b2713f6b3294c49841a9f516c39790841c72641e..c2848779ea75666d70dde2931dffa9ed096dce8b 100644
--- a/common/research_common/image_pipeline_evaluation/image_dataset_generator.py
+++ b/common/research_common/image_pipeline_evaluation/image_dataset_generator.py
@@ -1,4 +1,5 @@
 from abc import abstractmethod
+from pathlib import Path
 from typing import TypeVar, Generic, Tuple, List, Iterable
 
 from polystar.common.models.image import Image
@@ -8,16 +9,19 @@ T = TypeVar("T")
 
 
 class ImageDatasetGenerator(Generic[T]):
-    def from_roco_datasets(self, datasets: Iterable[DirectoryROCODataset]) -> Tuple[List[Image], List[T], List[int]]:
-        images, labels, dataset_sizes = [], [], []
+    def from_roco_datasets(
+        self, datasets: Iterable[DirectoryROCODataset]
+    ) -> Tuple[List[Path], List[Image], List[T], List[int]]:
+        images_path, images, labels, dataset_sizes = [], [], [], []
         for dataset in datasets:
             prev_total_size = len(images)
-            for img, label in self.from_roco_dataset(dataset):
-                images.append(img)
+            for img_path, label in self.from_roco_dataset(dataset):
+                images_path.append(img_path)
+                images.append(Image.from_path(img_path))
                 labels.append(label)
             dataset_sizes.append(len(images) - prev_total_size)
-        return images, labels, dataset_sizes
+        return images_path, images, labels, dataset_sizes
 
     @abstractmethod
-    def from_roco_dataset(self, dataset: DirectoryROCODataset) -> Iterable[Tuple[Image, T]]:
+    def from_roco_dataset(self, dataset: DirectoryROCODataset) -> Iterable[Tuple[Path, T]]:
         pass
diff --git a/common/research_common/image_pipeline_evaluation/image_pipeline_evaluation_reporter.py b/common/research_common/image_pipeline_evaluation/image_pipeline_evaluation_reporter.py
index c8e3e0dbab0da59b79dbb0cf972ceb85e47fe433..336a970cca3d768371bdaf3de3d588acc1b2e3f7 100644
--- a/common/research_common/image_pipeline_evaluation/image_pipeline_evaluation_reporter.py
+++ b/common/research_common/image_pipeline_evaluation/image_pipeline_evaluation_reporter.py
@@ -1,5 +1,7 @@
 from collections import Counter
 from dataclasses import dataclass
+from os.path import relpath
+from pathlib import Path
 from typing import Iterable, List, Any, Dict, Tuple
 
 import numpy as np
@@ -9,7 +11,7 @@ from polystar.common.image_pipeline.image_pipeline import ImagePipeline
 from polystar.common.utils.dataframe import format_df_rows, format_df_row, format_df_column
 from polystar.common.utils.markdown import MarkdownFile
 from polystar.common.utils.time import create_time_id
-from research_common.constants import EVALUATION_DIR
+from research_common.constants import EVALUATION_DIR, DSET_DIR
 from research_common.dataset.roco_dataset import ROCODataset
 from research_common.image_pipeline_evaluation.image_pipeline_evaluator import (
     ImagePipelineEvaluator,
@@ -78,27 +80,47 @@ class ImagePipelineEvaluationReporter:
         for pipeline_name, results in pipeline2results.items():
             self._report_pipeline_results(mf, pipeline_name, results)
 
-    @staticmethod
-    def _report_pipeline_results(mf: MarkdownFile, pipeline_name: str, results: ClassificationResults):
+    def _report_pipeline_results(self, mf: MarkdownFile, pipeline_name: str, results: ClassificationResults):
         mf.title(pipeline_name, level=2)
 
         mf.paragraph(results.full_pipeline_name)
 
         mf.title("Train results", level=3)
-        ImagePipelineEvaluationReporter._report_pipeline_set_results(mf, results.train_results)
+        ImagePipelineEvaluationReporter._report_pipeline_set_results(
+            mf, results.train_results, self.evaluator.train_images_paths
+        )
 
         mf.title("Test results", level=3)
-        ImagePipelineEvaluationReporter._report_pipeline_set_results(mf, results.test_results)
+        ImagePipelineEvaluationReporter._report_pipeline_set_results(
+            mf, results.test_results, self.evaluator.test_images_paths
+        )
 
     @staticmethod
-    def _report_pipeline_set_results(mf: MarkdownFile, results: SetClassificationResults):
+    def _report_pipeline_set_results(mf: MarkdownFile, results: SetClassificationResults, image_paths: List[Path]):
+        mf.title("Metrics", level=4)
         mf.paragraph(f"Inference time: {results.mean_inference_time: .2e} s/img")
         df = DataFrame(results.report)
         format_df_rows(df, ["precision", "recall", "f1-score"], "{:.1%}")
         format_df_row(df, "support", int)
         mf.table(df)
-        mf.paragraph("Confusion Matrix:")
+        mf.title("Confusion Matrix:", level=4)
         mf.table(DataFrame(results.confusion_matrix))
+        mf.title("10 Mistakes examples", level=4)
+        mistakes_idx = np.random.choice(results.mistakes, min(len(results.mistakes), 10), replace=False)
+        relative_paths = [
+            f"![img]({relpath(str(image_paths[idx]), str(mf.markdown_path.parent))})" for idx in mistakes_idx
+        ]
+        images_names = [image_paths[idx].relative_to(DSET_DIR) for idx in mistakes_idx]
+        mf.table(
+            DataFrame(
+                {
+                    "images": relative_paths,
+                    "labels": results.labels[mistakes_idx],
+                    "predictions": results.predictions[mistakes_idx],
+                    "image names": images_names,
+                }
+            ).set_index("images")
+        )
 
     def _aggregate_results(self, pipeline2results: Dict[str, ClassificationResults]) -> DataFrame:
         main_metric_name = f"{self.main_metric[0]} {self.main_metric[1]}"
diff --git a/common/research_common/image_pipeline_evaluation/image_pipeline_evaluator.py b/common/research_common/image_pipeline_evaluation/image_pipeline_evaluator.py
index 021508c0c3c9b5188ce4b5c79842b8058ac55c51..24fdfa63bfc5f21bde2a16cfcd9963405ee0f2e1 100644
--- a/common/research_common/image_pipeline_evaluation/image_pipeline_evaluator.py
+++ b/common/research_common/image_pipeline_evaluation/image_pipeline_evaluator.py
@@ -3,6 +3,7 @@ from dataclasses import dataclass
 from time import time
 from typing import List, Dict, Any, Iterable, Sequence
 
+import numpy as np
 from sklearn.metrics import classification_report, confusion_matrix
 
 from polystar.common.image_pipeline.image_pipeline import ImagePipeline
@@ -13,17 +14,21 @@ from research_common.image_pipeline_evaluation.image_dataset_generator import Im
 
 @dataclass
 class SetClassificationResults:
-    report: Dict
-    confusion_matrix: Dict
+    labels: np.ndarray
+    predictions: np.ndarray
     mean_inference_time: float
 
-    @classmethod
-    def from_labels_and_time(cls, labels: Sequence[Any], preds: Sequence[Any], mean_time: float):
-        return cls(
-            report=classification_report(labels, preds, output_dict=True),
-            confusion_matrix=confusion_matrix(labels, preds),
-            mean_inference_time=mean_time,
-        )
+    @property
+    def report(self) -> Dict:
+        return classification_report(self.labels, self.predictions, output_dict=True)
+
+    @property
+    def confusion_matrix(self) -> np.ndarray:
+        return confusion_matrix(self.labels, self.predictions)  # sklearn returns an ndarray, not a dict
+
+    @property
+    def mistakes(self) -> np.ndarray:
+        return np.where(self.labels != self.predictions)[0]  # indices where label and prediction differ
 
 
 @dataclass
@@ -43,12 +48,18 @@ class ImagePipelineEvaluator:
         logging.info("Loading data")
         self.train_roco_datasets = train_roco_datasets
         self.test_roco_datasets = test_roco_datasets
-        self.train_images, self.train_labels, self.train_dataset_sizes = image_dataset_generator.from_roco_datasets(
-            train_roco_datasets
-        )
-        self.test_images, self.test_labels, self.test_dataset_sizes = image_dataset_generator.from_roco_datasets(
-            test_roco_datasets
-        )
+        (
+            self.train_images_paths,
+            self.train_images,
+            self.train_labels,
+            self.train_dataset_sizes,
+        ) = image_dataset_generator.from_roco_datasets(train_roco_datasets)
+        (
+            self.test_images_paths,
+            self.test_images,
+            self.test_labels,
+            self.test_dataset_sizes,
+        ) = image_dataset_generator.from_roco_datasets(test_roco_datasets)
 
     def evaluate_pipelines(self, pipelines: Iterable[ImagePipeline]) -> Dict[str, ClassificationResults]:
         return {str(pipeline): self.evaluate(pipeline) for pipeline in pipelines}
@@ -70,4 +81,4 @@ class ImagePipelineEvaluator:
         t = time()
         preds = pipeline.predict(images)
         mean_time = (time() - t) / len(images)
-        return SetClassificationResults.from_labels_and_time(labels, preds, mean_time)
+        return SetClassificationResults(np.asarray(labels), np.asarray(preds), mean_time)
diff --git a/robots-at-robots/research/dataset/armor_image_dataset_factory.py b/robots-at-robots/research/dataset/armor_image_dataset_factory.py
index b211c633d02325a744275d0640f8acaa1ab9a7af..dfdf35b8ec6ea55e0cffc1c6f9458a69e7190109 100644
--- a/robots-at-robots/research/dataset/armor_image_dataset_factory.py
+++ b/robots-at-robots/research/dataset/armor_image_dataset_factory.py
@@ -21,7 +21,7 @@ class ArmorImageDatasetGenerator(ImageDatasetGenerator[T]):
     def from_roco_dataset(self, dataset: DirectoryROCODataset) -> Iterable[Tuple[Image, T]]:
         if not (dataset.dataset_path / self.task_name / ".lock").exists():
             self._create_labelized_armor_images_from_roco(dataset)
-        return self._get_saved_images_and_labels(dataset)
+        return self._get_images_paths_and_labels(dataset)
 
     def _create_labelized_armor_images_from_roco(self, dataset):
         dset_path = dataset.dataset_path / self.task_name
@@ -33,9 +33,9 @@ class ArmorImageDatasetGenerator(ImageDatasetGenerator[T]):
             json.dumps({"version": "0.0", "date": create_time_id()})
         )
 
-    def _get_saved_images_and_labels(self, dataset: DirectoryROCODataset) -> Iterable[Tuple[Image, T]]:
+    def _get_images_paths_and_labels(self, dataset: DirectoryROCODataset) -> Iterable[Tuple[Image, T]]:
         return (
-            (Image.from_path(image_path), self._label_from_filepath(image_path))
+            (image_path, self._label_from_filepath(image_path))
             for image_path in (dataset.dataset_path / self.task_name).glob("*.jpg")
             if self._valid_label(self._label_from_filepath(image_path))
         )