diff --git a/common/polystar/common/image_pipeline/models/red_blue_channels_comparison_model.py b/common/polystar/common/image_pipeline/models/red_blue_channels_comparison_model.py
index 78cb00e5d518a742604e90fe5a68fa27456c4154..d7b2416be281d5da5f657334ecb9929baa8a4e8f 100644
--- a/common/polystar/common/image_pipeline/models/red_blue_channels_comparison_model.py
+++ b/common/polystar/common/image_pipeline/models/red_blue_channels_comparison_model.py
@@ -14,7 +14,7 @@ class RedBlueComparisonModel(AbsoluteClassifierModelABC):
     blue_channel_id: int = 2
 
     def __post_init__(self):
-        self.labels_ = np.asarray(sorted(["Red", "Grey", "Blue"]))
+        self.labels_ = np.asarray(sorted(["red", "grey", "blue"]))
         self.label2index_ = {label: i for i, label in enumerate(self.labels_)}
 
     def fit(self, features: List[Any], labels: List[Any]) -> "RedBlueComparisonModel":
@@ -22,7 +22,7 @@ class RedBlueComparisonModel(AbsoluteClassifierModelABC):
 
     def predict(self, features: List[Tuple[float, float, float]]) -> List[str]:
         return [
-            "Red" if feature[self.red_channel_id] >= feature[self.blue_channel_id] else "Blue" for feature in features
+            "red" if feature[self.red_channel_id] >= feature[self.blue_channel_id] else "blue" for feature in features
         ]
 
     def __str__(self) -> str:
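Note: a quick sanity check of the rule `predict` applies, with the new lowercase labels. A minimal sketch, assuming the red channel sits at index 0 (the blue default shown above is 2) and features are per-channel means:

```python
features = [(200.0, 40.0, 30.0), (10.0, 20.0, 180.0)]
# red mean >= blue mean -> "red", otherwise "blue"
predictions = ["red" if f[0] >= f[2] else "blue" for f in features]
assert predictions == ["red", "blue"]
```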
diff --git a/common/polystar/common/utils/dataframe.py b/common/polystar/common/utils/dataframe.py
index 6d25f928076a4a13190655ae751df241ec2b6103..814915c9804998cd5cadcc31b7bb64bd4cfbbae2 100644
--- a/common/polystar/common/utils/dataframe.py
+++ b/common/polystar/common/utils/dataframe.py
@@ -1,27 +1,29 @@
-from typing import Any, Iterable, Callable, Union
+from typing import Any, Callable, Iterable, Union
 
 from pandas import DataFrame
 
+Format = Union[str, Callable]
 
-def format_df_column(df: DataFrame, column_name: str, fmt: Union[Callable, str]):
+
+def format_df_column(df: DataFrame, column_name: str, fmt: Format):
-    df[column_name] = df[column_name].map(fmt.format)
+    df[column_name] = df[column_name].map(make_formatter(fmt))
 
 
-def format_df_columns(df: DataFrame, column_names: Iterable[str], fmt: Union[Callable, str]):
+def format_df_columns(df: DataFrame, column_names: Iterable[str], fmt: Format):
     for c in column_names:
         format_df_column(df, c, fmt)
 
 
-def format_df_row(df: DataFrame, loc: Any, fmt: Union[Callable, str]):
-    df.loc[loc] = df.loc[loc].map(_make_formater(fmt))
+def format_df_row(df: DataFrame, loc: Any, fmt: Format):
+    df.loc[loc] = df.loc[loc].map(make_formatter(fmt))
 
 
-def format_df_rows(df: DataFrame, locs: Iterable[Any], fmt: Union[Callable, str]):
+def format_df_rows(df: DataFrame, locs: Iterable[Any], fmt: Format):
     for loc in locs:
         format_df_row(df, loc, fmt)
 
 
-def _make_formater(fmt: Union[Callable, str]) -> Callable:
+def make_formatter(fmt: Format) -> Callable:
     if isinstance(fmt, str):
         return fmt.format
     return fmt
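Note: `Format = Union[str, Callable]` lets callers pass either a `str.format` template or a plain callable, and `make_formatter` normalizes both to a callable. With the `make_formatter` call added to `format_df_column` above, the column helpers accept both forms. A quick sketch on a throwaway DataFrame:

```python
from pandas import DataFrame

from polystar.common.utils.dataframe import format_df_column, make_formatter

df = DataFrame({"accuracy": [0.914, 0.873], "time": [3.2, 5.1]})
format_df_column(df, "accuracy", "{:.1%}")         # str template
format_df_column(df, "time", lambda v: f"{v} ms")  # callable, passed through as-is
print(df["accuracy"].tolist())  # ['91.4%', '87.3%']
print(df["time"].tolist())      # ['3.2 ms', '5.1 ms']
assert make_formatter("{:.2f}")(1.5) == "1.50"
```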
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
index dadcd948489e85bd768b607e767142fa6718b83b..b65d0746a989d70a3cf5cc050f8be4998b0db940 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
@@ -1,13 +1,17 @@
 from collections import Counter
 from dataclasses import dataclass
+from math import log
 from os.path import relpath
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Tuple, Generic
+from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple
 
+import matplotlib.pyplot as plt
 import numpy as np
+from matplotlib.axes import Axes
 from pandas import DataFrame
+
 from polystar.common.image_pipeline.image_pipeline import ImagePipeline
-from polystar.common.utils.dataframe import format_df_column, format_df_row, format_df_rows
+from polystar.common.utils.dataframe import Format, format_df_column, format_df_row, format_df_rows, make_formatter
 from polystar.common.utils.markdown import MarkdownFile
 from polystar.common.utils.time import create_time_id
 from research.common.constants import DSET_DIR, EVALUATION_DIR
@@ -30,15 +34,12 @@ class ImagePipelineEvaluationReporter(Generic[ValueT]):
 
         pipeline2results = self.evaluator.evaluate_pipelines(pipelines)
 
-        with MarkdownFile(
-            EVALUATION_DIR / self.evaluation_project / f"{evaluation_short_name}_{create_time_id()}.md"
-        ) as mf:
-            mf.title(f"Evaluation report {evaluation_short_name}")
+        report_dir = EVALUATION_DIR / self.evaluation_project / f"{evaluation_short_name}_{create_time_id()}"
 
+        with MarkdownFile(report_dir / "report.md") as mf:
+            mf.title(f"Evaluation report {evaluation_short_name}")
             self._report_datasets(mf)
-
-            self._report_aggregated_results(mf, pipeline2results)
-
+            self._report_aggregated_results(mf, pipeline2results, report_dir)
             self._report_pipelines_results(mf, pipeline2results)
 
     def _report_datasets(self, mf: MarkdownFile):
@@ -70,11 +71,23 @@ class ImagePipelineEvaluationReporter(Generic[ValueT]):
         df["Repartition"] = (df["Total"] / total).map("{:.1%}".format)
         mf.table(df)
 
-    def _report_aggregated_results(self, mf: MarkdownFile, pipeline2results: Dict[str, ClassificationResults[ValueT]]):
-        aggregated_results = self._aggregate_results(pipeline2results)
+    def _report_aggregated_results(
+        self, mf: MarkdownFile, pipeline2results: Dict[str, ClassificationResults[ValueT]], report_dir: Path
+    ):
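+        # one figure with test and train panels side by side, saved alongside the report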
+        fig, (ax_test, ax_train) = plt.subplots(1, 2, figsize=(16, 5))
+        aggregated_test_results = self._aggregate_results(pipeline2results, ax_test, "test")
+        aggregated_train_results = self._aggregate_results(pipeline2results, ax_train, "train")
+        fig.tight_layout()
+        aggregated_image_name = "aggregated_results.png"
+        fig.savefig(report_dir / aggregated_image_name, transparent=True)
+
         mf.title("Aggregated results", level=2)
+        mf.image(aggregated_image_name)
         mf.paragraph("On test set:")
-        mf.table(aggregated_results)
+        mf.table(aggregated_test_results)
+        mf.paragraph("On train set:")
+        mf.table(aggregated_train_results)
 
     def _report_pipelines_results(self, mf: MarkdownFile, pipeline2results: Dict[str, ClassificationResults[ValueT]]):
         for pipeline_name, results in pipeline2results.items():
@@ -124,19 +136,94 @@
             ).set_index("images")
         )
 
-    def _aggregate_results(self, pipeline2results: Dict[str, ClassificationResults[ValueT]]) -> DataFrame:
+    def _aggregate_results(
+        self, pipeline2results: Dict[str, ClassificationResults[ValueT]], ax: Axes, set_: str
+    ) -> DataFrame:
         main_metric_name = f"{self.main_metric[0]} {self.main_metric[1]}"
-        df = DataFrame(columns=["pipeline", main_metric_name, "inf time"]).set_index("pipeline")
-
-        for pipeline_name, results in pipeline2results.items():
-            df.loc[pipeline_name] = [
-                results.test_results.report[self.main_metric[1]][self.main_metric[0]],
-                results.test_results.mean_inference_time,
-            ]
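+        # one row per pipeline: (name, main metric on the given set, mean inference time)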
+        df = (
+            DataFrame.from_records(
+                [
+                    (
+                        pipeline_name,
+                        results.on_set(set_).report[self.main_metric[1]][self.main_metric[0]],
+                        results.on_set(set_).mean_inference_time,
+                    )
+                    for pipeline_name, results in pipeline2results.items()
+                ],
+                columns=["pipeline", main_metric_name, "inf time"],
+            )
+            .set_index("pipeline")
+            .sort_values(main_metric_name, ascending=False)
+        )
 
-        df = df.sort_values(main_metric_name, ascending=False)
+        bar_plot_with_secondary(df, set_.title(), fmt_y1="{:.1%}", fmt_y2="{:.1e}", y2_log=True, ax=ax)
 
         format_df_column(df, main_metric_name, "{:.1%}")
         format_df_column(df, "inf time", "{:.2e}")
 
         return df
+
+
+def bar_plot_with_secondary(
+    df: DataFrame,
+    title: str,
+    fmt_y1: Format = str,
+    fmt_y2: Format = str,
+    y1_log: bool = False,
+    y2_log: bool = False,
+    limits_y1: Optional[Tuple[float, float]] = None,
+    limits_y2: Optional[Tuple[float, float]] = None,
+    ax: Optional[Axes] = None,
+):
+    if ax is None:
+        _, ax = plt.subplots()
+
+    y1, y2 = df.columns
+
+    df.plot.bar(rot=0, ax=ax, secondary_y=y2, legend=False, title=title)
+
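+    # pandas creates the twinx axes for secondary_y last, so fetch it from the figure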
+    ax1, ax2 = ax, plt.gcf().get_axes()[-1]
+
+    _format_ax(ax1, y1, fmt_y1, y1_log, limits_y1)
+    _format_ax(ax2, y2, fmt_y2, y2_log, limits_y2)
+
+    _legend_with_secondary(ax1, ax2)
+
+
+def _legend_with_secondary(ax1: Axes, ax2: Axes):
+    lines_1, labels_1 = ax1.get_legend_handles_labels()
+    lines_2, labels_2 = ax2.get_legend_handles_labels()
+    lines = lines_1 + lines_2
+    labels = labels_1 + labels_2
+    ax1.legend(lines, labels, loc=0)
+
+
+def _format_ax(ax: Axes, label: str, fmt: Format, log_scale: bool, limits: Optional[Tuple[float, float]]):
+    ax.set_ylabel(label)
+
+    if limits:
+        ax.set_ylim(*limits)
+
+    if log_scale:
+        ax.set_yscale("log")
+
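+    # bottom of the y-axis, used below to anchor the bar labels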
+    m, _ = ax.get_ylim()
+
+    fmt = make_formatter(fmt)
+
+    for p in ax.patches:
+        if log_scale:
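+            # geometric mean of bar top and axis bottom: visually mid-bar on a log scale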
+            h = pow(10, 0.5 * (log(p.get_height(), 10) + log(m, 10)))
+        else:
+            h = 0.6 * p.get_height()
+        ax.annotate(
+            fmt(p.get_height()),
+            (p.get_x() + p.get_width() / 2.0, h),
+            ha="center",
+            va="center",
+            # xy is in data coordinates; "offset points" without an xytext would misplace the label
+        )
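Note: `bar_plot_with_secondary` draws the DataFrame's first column on the left axis and its second on a twin right axis, annotating each bar with its formatted value. A self-contained sketch (the pipeline names and numbers are made up for illustration):

```python
import matplotlib.pyplot as plt
from pandas import DataFrame

from research.robots_at_robots.evaluation.image_pipeline_evaluation_reporter import bar_plot_with_secondary

df = DataFrame.from_records(
    [("cnn", 0.92, 3.1e-3), ("red-blue", 0.85, 2.4e-5)],
    columns=["pipeline", "accuracy", "inf time"],
).set_index("pipeline")

fig, ax = plt.subplots(figsize=(8, 5))
bar_plot_with_secondary(df, "Test", fmt_y1="{:.1%}", fmt_y2="{:.1e}", y2_log=True, ax=ax)
fig.savefig("aggregated_example.png", transparent=True)
```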
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
index 27afedbd8bd5d4c47eebffb2e66b7839e1dd6f75..30e3092511998a9c310b2682672ec7b8227b5cfb 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
@@ -8,10 +8,8 @@ import numpy as np
 from memoized_property import memoized_property
 from polystar.common.image_pipeline.image_pipeline import ImagePipeline
 from polystar.common.models.image import Image, load_images
-from research.common.datasets.roco.directory_roco_dataset import \
-    DirectoryROCODataset
-from research.robots_at_robots.dataset.armor_value_dataset import (
-    ArmorValueDatasetCache, ValueT)
+from research.common.datasets.roco.directory_roco_dataset import DirectoryROCODataset
+from research.robots_at_robots.dataset.armor_value_dataset import ArmorValueDatasetCache, ValueT
 from sklearn.metrics import classification_report, confusion_matrix
 
 
@@ -44,6 +42,11 @@ class ClassificationResults(Generic[ValueT]):
     test_results: SetClassificationResults[ValueT]
     full_pipeline_name: str
 
+    def on_set(self, set_: str) -> SetClassificationResults[ValueT]:
+        if set_ is "train":
+            return self.train_results
+        return self.test_results
+
 
 class ImagePipelineEvaluator(Generic[ValueT]):
     def __init__(
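Note on the `on_set` comparison above: `is` tests object identity, not value, and only happens to work when CPython interns the literal. `==` is what the lookup needs, as this quick demonstration shows:

```python
set_ = "".join(["tr", "ain"])  # equal to "train", but a distinct str object
literal = "train"
print(set_ == literal)  # True: value equality, what on_set checks
print(set_ is literal)  # False in CPython: the joined string is not interned
```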