From 01996ef7b29ac29817439081924cb65c7fe05495 Mon Sep 17 00:00:00 2001
From: Mathieu Beligon <mathieu@feedly.com>
Date: Mon, 14 Dec 2020 20:29:37 +0100
Subject: [PATCH] [robots@robots] (evaluation) refactor the evaluation
 framework

---
 .../image_pipeline/preprocessors/normalise.py |   7 +
 .../image_pipeline/preprocessors/resize.py    |  14 +
 common/polystar/common/models/image.py        |   6 +-
 .../classification/classification_pipeline.py |   9 +-
 .../polystar/common/utils/iterable_utils.py   |  21 +-
 common/polystar/common/utils/markdown.py      |   9 +-
 .../research/common/datasets/image_dataset.py |   5 +-
 .../digits/.changes                           | Bin 31330 -> 75097 bytes
 .../armor_color/armor_color_benchmarker.py    |  20 ++
 .../armor_color_pipeline_reporter_factory.py  |  24 --
 .../{baseline_experiments.py => benchmark.py} |  14 +-
 .../armor_digit/armor_digit_benchmarker.py    |  20 ++
 .../armor_digit_pipeline_reporter_factory.py  |  24 --
 .../robots_at_robots/armor_digit/benchmark.py |  57 ++--
 .../armor_digit/clean_datasets.py             |  13 +-
 .../dataset/armor_value_dataset_generator.py  |   8 +-
 .../robots_at_robots/demos/demo_pipeline.py   |   2 +-
 .../robots_at_robots/evaluation/benchmark.py  |  49 +++
 .../image_pipeline_evaluation_reporter.py     | 306 ++++++++++--------
 .../evaluation/image_pipeline_evaluator.py    | 130 +++-----
 .../evaluation/metrics/__init__.py            |   0
 .../evaluation/metrics/accuracy.py            |  11 +
 .../robots_at_robots/evaluation/metrics/f1.py |  30 ++
 .../evaluation/metrics/metric_abc.py          |  17 +
 .../evaluation/performance.py                 |  79 +++++
 .../robots_at_robots/evaluation/set.py        |  14 +
 .../robots_at_robots/evaluation/trainer.py    |  25 ++
 27 files changed, 590 insertions(+), 324 deletions(-)
 create mode 100644 common/polystar/common/image_pipeline/preprocessors/normalise.py
 create mode 100644 common/polystar/common/image_pipeline/preprocessors/resize.py
 create mode 100644 robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py
 delete mode 100644 robots-at-robots/research/robots_at_robots/armor_color/armor_color_pipeline_reporter_factory.py
 rename robots-at-robots/research/robots_at_robots/armor_color/{baseline_experiments.py => benchmark.py} (85%)
 create mode 100644 robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py
 delete mode 100644 robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_pipeline_reporter_factory.py
 create mode 100644 robots-at-robots/research/robots_at_robots/evaluation/benchmark.py
 create mode 100644 robots-at-robots/research/robots_at_robots/evaluation/metrics/__init__.py
 create mode 100644 robots-at-robots/research/robots_at_robots/evaluation/metrics/accuracy.py
 create mode 100644 robots-at-robots/research/robots_at_robots/evaluation/metrics/f1.py
 create mode 100644 robots-at-robots/research/robots_at_robots/evaluation/metrics/metric_abc.py
 create mode 100644 robots-at-robots/research/robots_at_robots/evaluation/performance.py
 create mode 100644 robots-at-robots/research/robots_at_robots/evaluation/set.py
 create mode 100644 robots-at-robots/research/robots_at_robots/evaluation/trainer.py
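
This patch replaces the per-task *PipelineReporterFactory classes with a Benchmarker that wires together an
ImageClassificationPipelineTrainer, an ImageClassificationPipelineEvaluator and an ImagePipelineEvaluationReporter.
A minimal usage sketch, mirroring armor_color/benchmark.py (the dataset and pipeline choices below are illustrative
only, not part of this patch):

    from polystar.common.pipeline.classification.random_model import RandomClassifier
    from research.common.datasets.roco.zoo.roco_dataset_zoo import ROCODatasetsZoo
    from research.robots_at_robots.armor_color.armor_color_benchmarker import make_armor_color_benchmarker
    from research.robots_at_robots.armor_color.benchmark import ArmorColorPipeline

    # build a benchmarker for the armor-color task from train/test ROCO datasets and an experiment name
    benchmarker = make_armor_color_benchmarker(
        train_roco_datasets=[ROCODatasetsZoo.TWITCH.T470150052],
        test_roco_datasets=[ROCODatasetsZoo.TWITCH.T470152838],
        experiment_name="example",
    )

    # trains each pipeline, evaluates it on both sets, and writes a markdown report
    # (metrics, confusion matrices, mistake examples) under the experiment's report directory
    benchmarker.benchmark([ArmorColorPipeline.from_pipes([RandomClassifier()], name="random")])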

diff --git a/common/polystar/common/image_pipeline/preprocessors/normalise.py b/common/polystar/common/image_pipeline/preprocessors/normalise.py
new file mode 100644
index 0000000..a00c8d0
--- /dev/null
+++ b/common/polystar/common/image_pipeline/preprocessors/normalise.py
@@ -0,0 +1,7 @@
+from polystar.common.models.image import Image
+from polystar.common.pipeline.pipe_abc import PipeABC
+
+
+class Normalise(PipeABC):
+    def transform_single(self, image: Image) -> Image:
+        return image / 255  # scale pixel values from [0, 255] to [0, 1]
diff --git a/common/polystar/common/image_pipeline/preprocessors/resize.py b/common/polystar/common/image_pipeline/preprocessors/resize.py
new file mode 100644
index 0000000..6afbc2b
--- /dev/null
+++ b/common/polystar/common/image_pipeline/preprocessors/resize.py
@@ -0,0 +1,14 @@
+from typing import Tuple
+
+from cv2.cv2 import resize
+
+from polystar.common.models.image import Image
+from polystar.common.pipeline.pipe_abc import PipeABC
+
+
+class Resize(PipeABC):
+    def __init__(self, size: Tuple[int, int]):
+        self.size = size
+
+    def transform_single(self, image: Image) -> Image:
+        return resize(image, self.size)
diff --git a/common/polystar/common/models/image.py b/common/polystar/common/models/image.py
index 4d598f5..29a0b13 100644
--- a/common/polystar/common/models/image.py
+++ b/common/polystar/common/models/image.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, List
 
 import cv2
 import numpy as np
@@ -38,3 +38,7 @@ def load_images_in_directory(
 def save_image(image: Image, image_path: Path, conversion: int = cv2.COLOR_RGB2BGR):
     image_path.parent.mkdir(exist_ok=True, parents=True)
     cv2.imwrite(str(image_path), cv2.cvtColor(image, conversion))
+
+
+def file_images_to_images(file_images: Iterable[FileImage]) -> List[Image]:
+    return [np.asarray(file_image) for file_image in file_images]  # convert each FileImage to a plain ndarray Image
diff --git a/common/polystar/common/pipeline/classification/classification_pipeline.py b/common/polystar/common/pipeline/classification/classification_pipeline.py
index 074cc15..99c85f5 100644
--- a/common/polystar/common/pipeline/classification/classification_pipeline.py
+++ b/common/polystar/common/pipeline/classification/classification_pipeline.py
@@ -2,7 +2,7 @@ from abc import ABC
 from enum import IntEnum
 from typing import ClassVar, Generic, List, Sequence, Tuple, TypeVar
 
-from numpy import asarray, ndarray
+from numpy import asarray, ndarray, pad
 
 from polystar.common.pipeline.classification.classifier_abc import ClassifierABC
 from polystar.common.pipeline.pipe_abc import IT, PipeABC
@@ -29,6 +29,13 @@ class ClassificationPipeline(Pipeline, Generic[IT, EnumT], ABC):
     def predict(self, x: Sequence[IT]) -> List[EnumT]:
         return self.predict_proba_and_classes(x)[1]
 
+    def predict_proba(self, x: Sequence[IT]) -> ndarray:
+        proba = super().predict_proba(x)
+        missing_classes = self.classifier.n_classes - proba.shape[1]  # classes absent from the training data
+        if not missing_classes:
+            return proba
+        return pad(proba, ((0, 0), (0, missing_classes)))  # append zero-probability columns for the missing classes
+
     def predict_proba_and_classes(self, x: Sequence[IT]) -> Tuple[ndarray, List[EnumT]]:
         proba = asarray(self.predict_proba(x))
         indices = proba.argmax(axis=1)
diff --git a/common/polystar/common/utils/iterable_utils.py b/common/polystar/common/utils/iterable_utils.py
index a004688..01bc2da 100644
--- a/common/polystar/common/utils/iterable_utils.py
+++ b/common/polystar/common/utils/iterable_utils.py
@@ -1,4 +1,6 @@
-from typing import Iterable
+from collections import defaultdict
+from itertools import chain
+from typing import Callable, Dict, Iterable, List, TypeVar
 
 from more_itertools import ilen
 
@@ -8,3 +10,20 @@ def smart_len(it: Iterable) -> int:
         return len(it)
     except AttributeError:
         return ilen(it)
+
+
+T = TypeVar("T")
+
+
+def flatten(it: Iterable[Iterable[T]]) -> List[T]:
+    return list(chain.from_iterable(it))
+
+
+U = TypeVar("U")
+
+
+def group_by(it: Iterable[T], key: Callable[[T], U]) -> Dict[U, List[T]]:
+    rv = defaultdict(list)
+    for item in it:
+        rv[key(item)].append(item)
+    return rv
diff --git a/common/polystar/common/utils/markdown.py b/common/polystar/common/utils/markdown.py
index 79a9d83..3997375 100644
--- a/common/polystar/common/utils/markdown.py
+++ b/common/polystar/common/utils/markdown.py
@@ -1,6 +1,7 @@
 from pathlib import Path
-from typing import TextIO, Iterable, Any
+from typing import Any, Iterable, TextIO
 
+from matplotlib.figure import Figure
 from pandas import DataFrame
 from tabulate import tabulate
 
@@ -35,7 +36,11 @@ class MarkdownFile:
         self.paragraph(f"![{alt}]({relative_path})")
         return self
 
+    def figure(self, figure: Figure, name: str, alt: str = "img"):
+        figure.savefig(self.markdown_path.parent / name)
+        return self.image(name, alt)
+
     def table(self, data: DataFrame) -> "MarkdownFile":
-        self.file.write(tabulate(data, tablefmt="pipe", headers="keys"))
+        self.file.write(tabulate(data, tablefmt="pipe", headers="keys").replace(".0 ", "   "))  # hide the ".0" tabulate adds to integer-valued floats, keeping column widths
         self.file.write("\n\n")
         return self
diff --git a/common/research/common/datasets/image_dataset.py b/common/research/common/datasets/image_dataset.py
index 13bb5a5..9378439 100644
--- a/common/research/common/datasets/image_dataset.py
+++ b/common/research/common/datasets/image_dataset.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from polystar.common.models.image import Image
+from polystar.common.models.image import FileImage, Image
 from research.common.datasets.dataset import Dataset
 from research.common.datasets.lazy_dataset import LazyDataset, TargetT
 
@@ -9,3 +9,6 @@ FileDataset = Dataset[Path, TargetT]
 
 LazyImageDataset = LazyDataset[Image, TargetT]
 ImageDataset = Dataset[Image, TargetT]
+
+LazyFileImageDataset = LazyDataset[FileImage, TargetT]
+FileImageDataset = Dataset[FileImage, TargetT]
diff --git a/dataset/dji_roco/robomaster_Final Tournament/digits/.changes b/dataset/dji_roco/robomaster_Final Tournament/digits/.changes
index 35d75bb3ff7e3bcf5ffc930deef3313e092ba0fe..094f578582ac0df9123e392b0be3aec31d5a695b 100644
GIT binary patch
literal 75097
zcmb`QO^+nUbw&636$Do{;5NFlsz1t08icKdErAwBfGndygETz`!7)gVHiG`U&y8YB
z0;?i3?`Z>rSv2z2i+J(A<K@dg{^FBQK7Ie)AHMwd{a0W9`1Mzx{=+AK<WKx@xnJ%N
z%i;dz@U%QF%O`*HFJJ!n-|xTv_{nEqfBXKM58wSQkN*33l!xE__T?Y{`S|(2{^s*9
ze*MKSfA{&jAAa-Y$L~M>@bcNOmY2iJ;r{;aaCi9o|M}l<KK;7_bE+|iyK{|M?hgIp
z0P(Nf(eds8BWe(^8uL(NPIr$r1}X+~zFW!-J>2z7U^{omlFs8@&A8{gQ%PpIH<0t)
zsoKl}$65w}S?(TMGN%@TUbk#sPIpf&8EC-iZZ}{KJ*GYzWOHey=J5_u>vr=L4wUEl
zj0kk<xjdsa<;vh#4`Dmq{?D+igB;_jI;#;#>kE$9&6W(1*4jd*W&m<op3D)O(Xtui
z)LQf->nepANbA&&Y~|b@4T4!8Czw{Aj~wRG9u5IfEjDM2ZKpT*z>;a`e~UoT+O3?>
zOq_aUHcmr0I~#byryW~5$mVm)gU4D4PDoA70x+%Rc(}VS9mm6cZJr(wfYzlwAOo#>
zK8WZknAQ+JAUADJ5KQZE9+0(GLm!?v(asD!)-v#jrC~pnf`VDvC=JY0scVmDK<f*G
zX<g2vFwtNL)GDq$<#0MiKRv^=lFBnFco#rDvH$xX)4HhV-r775Yt!)z4B2XRGS6sp
z8w~~18k=WfY@U-3l-t3nm4XJ9)(8M;J=8OWhf<u+s7$&2K#+k-0)lD72QaM)oDV>{
zOzn)2wPxdt1?;#+l@JS1F>OFP>l8?s)`)X3Lp_@V4d~SRsDNNx=h*|QJ!n9+p8|x>
zt)yo_IyV)_eObf}218vl4iJ@#LQg|znB7u<0(xwvE}+i9WqBCh*9zV!wExGY-u_qR
z{PSUXcz8oOKU9VvX*ma|KOzcCl`F~Wk|H%e8w^HHKx!NJP=*Gq?3oHiW3iVPLSyJu
zV!bRq6?j~-j2dp2hq4zArYrh@8L3Vu0ARwlj%nE4meJG-yGVC=0n*icfE>!Q7m$?w
zy8RzV+R|ZCj%l?9%(-0J39)I9Gng*YU0#w^w{#vb>Rkn6d3i+DTATKWD`?mD2$7Zt
zmn^wcH!$6m2`orWvJ<wa1!b!tUFx#DJOL;T;mE|xg*_3gbYTaHXqW9QFF;yr`<$wL
z=b)Y+*h>lF>`Nw-AGW6tq!mk2%GN)Vx4njF`<fw^7a%niPQR<iLMDEr6EYyJaXv?3
zy5k2@Q|X0_RJSW#!7&^_>(GN~jm%l(cwkzK38YnkbILHCEjtr9wB9Tj_;xEm`{frO
z{`13+?|%6F4<Ek$_~ZKz-~H5eG_Bms0h0>(95A`Ungb@K!5QE%3m4~rNq98{q#32+
zYicMHnV%Dh&AVAhUV}`;cMe&Q_;N4PoMcWV7hN~4<Q$UTjdJ_Z-V!n`)y@|d_jOZy
zr;wBoOT@HpG$%3*-PRzJJZTO&&O(GW2vPM##%>i@4y519)BUb~$vF`;^3`|Gh#b$S
z)1E5M8j(pK-W-w}r1I<&xZQO2b1Ku2Y6|J4B{^khDTt;@_i&gu`ptJTjr8Uavb@q#
zQ0pk2=^^ceuQK_wuywvNFq24Uju|Ovc{Hb4ax^Cc%St0oEou^k&#B;!Cf5yXn8_W@
z9Fp6!PRT~kUgyMsw0jvc(=CiDRH>L0$R<halwm+xw+CcWjh|CVi&N*q(|y*O*kIbA
z56mP5pYH~X@#^{Z8US>;JsKLRT8Ba9u7;q=rN*2dYxNh031rf8GCdf&s3_WXS{BTt
zc0H%V=}!t5Ynavnq+6@a6@qCMI+!*FNQYM&`-7R})N`)&{_G)1N#`|zz=9^DK2Yzj
zsgT9AR1!0Gc9WudP32tg1q;|3XQZ;+{bRE0?pT1dmYExcmSI5J02oMFAvpkPYb3ep
zXe|bqdjE0fUC6|J%A}w=)&2BCmc~4g$xY81nKu4~OfAb8*m^f$T6f0f5Er<WDVyIX
zO?d{JTlELhMvXm}xq)h(5|BwjeZKpMe`lATMl{nE48XKm7T0NQHv2>-({5(kKs;wz
zF73+g=UuAK$jtAPrwaqpie=W-oj(1xb@B_QJ$)c;zMQt+n#;J0)?=hcw+&0d)JsE@
zTWRLncJ`XnTxv!-R|-Mz0qIh11LkM+zcm5DOs>@D8~`)9BAjFJ0c{feJiYColR=@0
z=_(&MTBy6kk^63Bt+Q!B;6b%AoKuZx9onVmy0f+8BEEzq(yk%(_R;6oJ?A3+Rcmft
zGb}In_lMKz=}p%Rxw&rb=E`mNRQ`i$V{I_)wH(*o@VPwBw3Sry&&5F-0|O~#21Kbc
z3pc-I1!$oUzAVly_iUt0fw|BqGoaBWTiHIEWZS3#NbXG<)0gKv`x-=h^d$0~ub_9g
zJrc}OdBF&gR=hxz)@4kR=nfw4mWP<qTID2ZKyLYq=`M3?h?Eta1*=+yqYJjK+zNLq
z<po9%f=GFcp;GQ1NUQPlSr9X42ckSj5M>NXLwI`=5lCt411Y_BAmu$IkaqWVNrh1p
zsAClvjDZ%FB?3{VM#tFgZdKUrGF*p9d7B2Jm6Rls9kc2_EpA2IRShHD@wdJbVC+YE
zSxbc4x%W(0s=|~Lrm>7ufV95)GzvA<sRJoXTX`p?6ki~vFlMGm+0VyKc)Jq<mC^=X
z-g>D*l|af803%4s2=!Dqfq`h*b#C*r`jWMd9OolZX2AdxQA+CoqVyuUD@TBvJSLE`
zAOWOw(;&*|0YsUL=gyzKRC@<f770S7G%10!C!br-szwNrGT6#=n)cv%zU)SUO1qP~
zsR9~hfrOR{(vptBKw9wv(sCF`%V8kgMOJd3ySpkFf??$i*=RxOILy^>xhgcVM!Z&1
zclO3TV1`-=#%h;_sIH^4tkO4xyOniJ5u!Rk+FLMnjUa+3Jqd_XKSpm$x2n498X9F>
z4x;pLk0?iZ9!O!?2=Pc>RUTn<uw!rC2JrY;vzM#pa#hn;>Sh5QC*F}r$4F&$>rMvV
zq;6}Cf#5+Jtp!q=5)fs=p0~Bj?xH}-d<V$ux^7>rF+%0}e%BP1sg%^g$)y{oN~VFd
zqLp`OI~wr^ot#W3Ay1iAH=#254jb2P1V}3<#P8+ar?03__g+U0%;PJAnY?OW9pQ9#
z+iVy06>7SV${2vQpz_GmNK^)KK*|o1OzrC=@iOPHatVQyr6ZbDP?3%bkn&;-DrH_w
zR||UBtqVxmKpHA#^8k=muSSy4Z3ReaPt(&=nxR0-<Zw*NsEkD2pwFT@GX$VCT$vzQ
zZURARqM*7cdFf;7Xbj`nLP~SWy4tM|b6PoF=8v?pJ4TjtdnX7g4-rK#Z4hHY%6OUf
zr53?~v;v=Du2s5Di&R-M%y5q~&jnIC5)f6pC$smuu7r#pEF(Z5n3Kwh4oKxj2BKmw
z2;{wrg)WdPPPstZGr>_+2M?99pBO~>W*i7kq}}{55rwg1vSC-QV5%PmqR`#trMw4b
z0$yoQf~ac5On&Qf&;Uvbv7;56reo1>r!UNj!iZK!U8Z26P+czvFbQ8XPU)Y%fB)TA
zUw{AZhu?oZef{O>_aD#ybw2<3Yi?=mySbYk&`B`AjhyFmyReh$?k?=ao$bL+?Cv%+
z4O=&5avPc^p_@6wHWceRDPiowPHs)Mv6zub_PLG4j!r6i+t?(RH$%Z~Xi7KofTnrM
zHdUaLUX?xA*P`Ul?x;4Q!~&DBaSv3l7_yUWYY$m#X%fXrp|}KqPN0Id5*6&^CAe))
zjX{w&?|HYe9M2>(+J)8oh3=&FVGms`8L_PLbVuCkv6@rR)sg{r(g3l|Wr_v2n+Ntl
zCofHIL-8Z_fPiXallp-31tboWxNUp0xmlDFHH{WGslua4-{&?IA7YP(*LCb>X*#u$
zZd&;#+2;0c!P@8#x>hQf(&d>9c2eZq-Y&=UdbqdmJQpS-F1zkLH;Iitc&Bie30p3E
z3D#P*U~O1_BuJf9W42iZc9ITnW0BvtV6F35`Of9`nvp{rtG*%o!9Pn|Ok2E7x#)Gb
z;&^F#cisc6HEfqQc$b{!JC-)+1FM%jEyZlg;xHG^)J*5|Z4=E14R^5V*|xE9+J0?%
zNl&LW6?~(T3`FwSXaA7ZD;{?+F5k+xLu~pVtS#GRyqa}D!P@!^SgW?_ueH`9Sk0>3
z-@4O%gO>+7>2cn)7eFVkY43nqF41l~>HFR$N|dR|PCIb-kk#ipawcb^=+5=KoqjlJ
zd$(BgNsI3;tkpdFgf*AnqCV*;luC<%vZj%VdYUZOobcjSW6AabrI}9d7R#u~g0-b6
zruEq+b3QUY7$@tpOrSOx0&2OGzHTQH>2cO<LLjWZrzhDObH@JZ0!y%HkWwF7gD2O;
zo0>Pir|ld;K9X$<Z^2Hwk+zqN-pSe~43;bgX|EN4+FCh6EG@<KBb}S<rA=A0cRIgm
zn`#ON&a#dEW#>Hj?wc+z25V~_^xfKO>dLsgZ6>32Hk+P)>~NFnw0(3NiMHBAjh;%$
z;=J>t>OpyJ%>k#~oLel)mt~zr3sM#c10@<at}0M@l?GIXrWqLpUA11IGR%XjECYd*
zp&Ce89S3RmiGQ<#G+d)Q7@*RW<vz+v5>U^k3`A5ynDEYWNyguqnI^;UKqv3$ZJINn
zvb7DW+)=7XHc%O71#0b?uHr*fW+;TP&O>GH+#8_sOra`+LXa|L&FnjAeFK#?iFt6g
zoE@kXL7*~C&TKhLG@S%WPZ`l>lSZJLzd&W03so6+<ZfO@p>Jpj0P5E(qitx)UYPVC
zSV=;5UA75Pt1`J#RO!JHA1;dz^es%@2HtYTAf-<PDT7duui?t9VduC!9QREB-rOFM
zhAfgFalTS`l0-WZn7%5uwNaWmChrk%BDIo?57Vh)&gp439CUNF)2Sw#$dO8@_?iiS
zHWLn1M)j9Z<*5Y+Dq}~W)`bQtlL?^mE;y|tllK_6BqC7heJ>3w?raEeELFUjOfK`y
zJ?SE7lEDG1lr~VAgQven*1Q6>ItTO(3$`+hX_B#Lnrmdx5U6DnR!trb4Hz1lR%|7Q
zX=ael9H`WrG2hHCSB6&O>w=Evk%(6rG`=-RNk+vxfyy|Wh7B9p1u7GuK#_e@-UCVk
zuQFoB#p%5Ql`ltzs@*HGs1*sIGLmJ%TAGsYsZciGz?2zApt1uyP+5Eh$_M4090Ii>
zJp7cSiV(_@FWozcushehIqxiK5=VvPP@Nrp(yz&aB&1Xv3Uu<}x2?z<sB~~o<E2rz
zUUC!3-i374(IoKjNNIf;PANqfq&=5>w?z5Hbb!go1E`G6#+*#2qoi-ndOjh25=qqI
zkuvb5LvAu6XH&Zg9OQCi0hN=;*i4U<4m?EPKvnj<(77d54Wumgj&3n^Fd$|1J~J*Y
zbmV&~CE@$D($?q8wz9AjsI(1%N<9TCYaI;jv`oT$z5#0ai`YBY&rYRB1Zkyr`X%x$
z%bQONyU;U8S<wj;eKR?6p!Q%$MC6fTwPihXK1g-*WTxYjw~*6eq}xDvt88{!RvZvx
zBvTl^2`ukT&(rU#Y;uUWN=C{I&9pV7K&9OsqdJva11YT$NSTL>uN*r$LPaf$z!|&1
zC`F<wBlZKSy!Sxj5PQ=(kT6Lim8OLzEK)A`C#({|3sys?UnyI+4xx%y)4LtPpb-Ah
z9Wj!|s)BmRQWdnCBc}$0D%MbyNP<NJRO)uYsvGh^v4<+m0vjWP*HL;4R`r#HEU}&C
zRIo_1Te8jJE^D?Ylz3F-$RRtlUetyzSnGTsYprrpM|(a1rOxL<)kF1yxka;7IT{be
zOgRkzXsH6#n+2<tHmQ|VfZU@=Mp66IQUO*KL87zrWg4JZ3Y9YtsI_fe*N}OLBp_5-
z?(0~&S?lG)@!|CL*2@q>1g*c~J*7V{w3-<kx=5gP&+--nsw4Atk}rpQhUkbUoGgrw
zz)NV#X>zXwKN^cbY3#5Na?J>owWahr+lT<DjR@#n)|LsZT``9v{YHkiy2Hu$;O*h$
zEAJZ1X0iV&CZPRUho|HF<jy%KzjImT1Br*Y|HeY`$!{x7<NM-rrz8G%#x1SAHew6b
zR)WCVs0*kn*yemPQc((pe68$4)ta_^d|XC9nG|UEig~h7y3#z<W-N4G+hPt-t+L3H
zw2A3PmQcb1>2^TcV+CqmM+CyzHR67aw6;Ie;0(m{BgdPSSD!^*+Jc?0O1GeTl?00x
ztm+`WV0kg!g%V>vnR9eB;b}(~tSwE@!l#{Numn#k>Y}*~tP_o3IblneS8wDQVPmXu
zOJkkp74m2;BYke1ujKPc4<)a_TE~^KZ2F8~vh}qYkY;NnVAbm|Se(4{>x5ahF@)wY
zH%=^qwMTpDt#Trj;a}FvUj~Osx+6h!F#_K$wy||SEo>cNuy9LRNQRx+>Mi;sBcgo0
z)-D*V<uL=xtg)a)MjKYJwrEU$jaCA%nrw;=PJPaDm~TFGDdj(J397{s8rWJ;pfYC)
zRQl*ZWiG)xVVWw5uuDTADve8ma*TvAX}n>DUsnbks4V7yl(n3^aVNz$bq19l=88wX
z%#M}<l{FQhGV}pSDy)Pd9@&WzPE~pfZoy>U&fGE0UA&@{2011TVaT)<$6Xyj>uW;F
zH!d^EG%e*{az!Z51gNwMAZ2I;Ql2JAnGRhBx+&ETRQB63mCbr@pq5E^VA*)UeKD?7
z<!?ZxV`8+LWp6&duFOIvt4R+GRHmsQEsJ!p%aAj>TEx#o?M88ivT)8-I_p(o%3lpD
zD;rTjLe>NfxxSZyCr~$oX?X!k6-=&I+M~Qjrbu$_FEem1{a^jzqy_4@98ORFA1zSh
zdv!|c_*|U~t3k>l_ldyR#TX#fOUBC!NSU0Su+lPD4^-aO0G0I&hUHmNM=Pb0B&~Dm
zrLT|Kr<Ff&a<!>StH^|I8cQ?QQbpuQpW7xt8IR+uUFynQK3V^!dk6hghmtND_G&y)
zp!QfsaY|(YX)KYYj6h{Uf{}hUPY+Z!8v&J$H^Ts}7y^~iJXB@o4ASnDXvnIKoNv0k
zj0PHKkO)(yZ@P>hbV`HccOf}MWiSs?*6{LWPOIYbb!v`OevM~&L0@EWq??l-hrw6T
zZlJQ;AW)eU1}g1Qpi+=Bic-Fb$8bk!%9yjLiKdRC%*v$|%L&BrPmbyvL?5V?C7`lU
zl~I1O#-CAsR&RjXy^cDJI~<@gJ4l0_EF4{W^OPW_&rP0dTE?8>q=D)UP-)XbRc6Kc
zn5Jx04iuMTVtt_Uigk=OQ$b7m#H7XrDr4>^OoUX{j3|?FOQ6y)0hM<#X^4}NOQ4oZ
zKxNvP)-=iBXDozN)-<(Qsw2-;)-Y6M<P9{p_D0VGMHCfuSW>0wY2TBEDNyOEG7w8%
zveEZvy+)wYeg|p~XP72;u0RRFl}G@UZjKg5S$_j*`I~R$IAt9Hlk%+eW7_3%x$J_T
zv}2;J=12pT*%(liRS`x0qzH~)Oh**vx<W3En9_2Cl!lu&Oq`UGK%lb03#1hz-FOF0
zsib3|oAOT}Wn#iuHZ8fsQ7Cf+Rcje^N%Xw>w!*vj-+cJ~-49><`rlqY`xP%JFxUwl
znI6Wi$5*V*jmI$;D>2(`z)C=S6R;A+-guq>bj60<0H9jk7FPSeZxUEZ9Bw=xot>J}
zo6l1`p4s~w&yRes1FTrfn*_QI^XuIS5bvz?9Nt*b>PPFZpI`j|`;Euj=X1hSA9CaO
zDUr?trt9Y!r&H<7YU}6q&yz8&II^41SIf`rn1VZrAExH}dNm0eriH%i$8*hGC}X^4
zv369kGSB(OV6oTd?(vM}Eq;A9F23euI+OQvG9#gi#qjt_j(N2h9$)d?*N@{-n#E*3
zXR+11XtUoQQ{%3F9r<0UK3(sXG^tq)0dd2c)wYf6#jG};UO&Ei(|7%N>}tXIgnPx~
zh<l??c}HR8#lP#-^L%6G`TdoSwyU3~_k69Ack}s`BE+rd3DtIoEi7jxf4~0q5t!~b
zW@&nF<oMDgzu(gH{A*$VR)^;4<#1k(hus~T*ZU=kZH&nHzW07v=JAyl`>WN^$=bLz
zdP*x%8~vdB7oXn9hBvhPRRK<(Z|n%quYBU?nwcZFEw;<^jb>$X*GgmV&GoF525vpy
zxMr49{(6&aoDvf4w>mt|(RR<T)TFL?!|ylA8|$ffxmszxUd`}Wg`~x!RFAKecdk}L
zS6;K%e1x^BUC^}BNCJ;H&X&HGDn;e_#yRI=yD@1z->5i`H>~9DxiL6A-XwUrWL_z`
zUdatN%S{}@W#>w9_3GF8g44>2RM(Fq2Tiif7j{-&9=Q7XMgb`YBJlfbW8?Y#Qj^E|
z@s-=8pRN8Uz{*|44L}tsgaSXm-!}(<wCxuk0PK0c#n1D319h)Q$@7Gzy9*Y1J~qGj
zeHOFd4)XgWb>16TU!Gs-@Vh?Z%RJ}36`BwTgPH|JgPY&S&{of5PF7l-ZtQrq9r4EV
zxejVJoT5}C5d<asol8!DxM6=317H}%;s`Fe=-zBA1Q6*;>H^pwoP+`&_}HH?{sgGF
qnwP2f2SA|!pfVKzIJuoEcnENA`;+hhT2=UqCS0;){{Pc2{`7xcczBoq

delta 16
Xcmcb4isjK4#tku5Y+MQowOq9TLLCNK

diff --git a/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py b/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py
new file mode 100644
index 0000000..a01bf0d
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/armor_color/armor_color_benchmarker.py
@@ -0,0 +1,20 @@
+from typing import List
+
+from polystar.common.models.object import ArmorColor
+from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
+from research.robots_at_robots.armor_color.armor_color_dataset import make_armor_color_dataset_generator
+from research.robots_at_robots.evaluation.benchmark import make_armor_value_benchmarker
+
+
+def make_armor_color_benchmarker(
+    train_roco_datasets: List[ROCODatasetBuilder], test_roco_datasets: List[ROCODatasetBuilder], experiment_name: str
+):
+    dataset_generator = make_armor_color_dataset_generator()
+    return make_armor_value_benchmarker(
+        train_roco_datasets=train_roco_datasets,
+        test_roco_datasets=test_roco_datasets,
+        evaluation_project="armor-color",
+        experiment_name=experiment_name,
+        classes=list(ArmorColor),
+        dataset_generator=dataset_generator,
+    )
diff --git a/robots-at-robots/research/robots_at_robots/armor_color/armor_color_pipeline_reporter_factory.py b/robots-at-robots/research/robots_at_robots/armor_color/armor_color_pipeline_reporter_factory.py
deleted file mode 100644
index a24ad1a..0000000
--- a/robots-at-robots/research/robots_at_robots/armor_color/armor_color_pipeline_reporter_factory.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from typing import List
-
-from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
-from research.robots_at_robots.armor_color.armor_color_dataset import make_armor_color_dataset_generator
-from research.robots_at_robots.evaluation.image_pipeline_evaluation_reporter import ImagePipelineEvaluationReporter
-from research.robots_at_robots.evaluation.image_pipeline_evaluator import ImagePipelineEvaluator
-
-
-class ArmorColorPipelineReporterFactory:
-    @staticmethod
-    def from_roco_datasets(
-        train_roco_datasets: List[ROCODatasetBuilder],
-        test_roco_datasets: List[ROCODatasetBuilder],
-        experiment_name: str,
-    ):
-        return ImagePipelineEvaluationReporter(
-            evaluator=ImagePipelineEvaluator(
-                train_roco_datasets=train_roco_datasets,
-                test_roco_datasets=test_roco_datasets,
-                image_dataset_generator=make_armor_color_dataset_generator(),
-            ),
-            evaluation_project="armor-color",
-            experiment_name=experiment_name,
-        )
diff --git a/robots-at-robots/research/robots_at_robots/armor_color/baseline_experiments.py b/robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
similarity index 85%
rename from robots-at-robots/research/robots_at_robots/armor_color/baseline_experiments.py
rename to robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
index 703fefe..1ac6f2b 100644
--- a/robots-at-robots/research/robots_at_robots/armor_color/baseline_experiments.py
+++ b/robots-at-robots/research/robots_at_robots/armor_color/benchmark.py
@@ -13,9 +13,7 @@ from polystar.common.pipeline.classification.random_model import RandomClassifie
 from polystar.common.pipeline.classification.rule_based_classifier import RuleBasedClassifierABC
 from polystar.common.pipeline.pipe_abc import PipeABC
 from research.common.datasets.roco.zoo.roco_dataset_zoo import ROCODatasetsZoo
-from research.robots_at_robots.armor_color.armor_color_pipeline_reporter_factory import (
-    ArmorColorPipelineReporterFactory,
-)
+from research.robots_at_robots.armor_color.armor_color_benchmarker import make_armor_color_benchmarker
 
 
 class ArmorColorPipeline(ClassificationPipeline):
@@ -38,20 +36,20 @@ class RedBlueComparisonClassifier(RuleBasedClassifierABC):
 if __name__ == "__main__":
     logging.getLogger().setLevel("INFO")
 
-    reporter = ArmorColorPipelineReporterFactory.from_roco_datasets(
-        train_roco_datasets=[
+    _benchmarker = make_armor_color_benchmarker(
+        [
             ROCODatasetsZoo.TWITCH.T470150052,
             ROCODatasetsZoo.TWITCH.T470152289,
             ROCODatasetsZoo.TWITCH.T470149568,
             ROCODatasetsZoo.TWITCH.T470151286,
         ],
-        test_roco_datasets=[
+        [
             ROCODatasetsZoo.TWITCH.T470152838,
             ROCODatasetsZoo.TWITCH.T470153081,
             ROCODatasetsZoo.TWITCH.T470158483,
             ROCODatasetsZoo.TWITCH.T470152730,
         ],
-        experiment_name="test",
+        "test",
     )
 
     red_blue_comparison_pipeline = ArmorColorPipeline.from_pipes(
@@ -62,4 +60,4 @@ if __name__ == "__main__":
         [RGB2HSV(), Histogram2D(), LogisticRegression()], name="hsv-hist-lr",
     )
 
-    reporter.report([random_pipeline, red_blue_comparison_pipeline, hsv_hist_lr_pipeline])
+    _benchmarker.benchmark([random_pipeline, red_blue_comparison_pipeline, hsv_hist_lr_pipeline])
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py b/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py
new file mode 100644
index 0000000..f4792c4
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_benchmarker.py
@@ -0,0 +1,20 @@
+from typing import List
+
+from polystar.common.models.object import ArmorDigit
+from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
+from research.robots_at_robots.armor_digit.armor_digit_dataset import make_armor_digit_dataset_generator
+from research.robots_at_robots.evaluation.benchmark import make_armor_value_benchmarker
+
+
+def make_armor_digit_benchmarker(
+    train_roco_datasets: List[ROCODatasetBuilder], test_roco_datasets: List[ROCODatasetBuilder], experiment_name: str
+):
+    dataset_generator = make_armor_digit_dataset_generator()
+    return make_armor_value_benchmarker(
+        train_roco_datasets=train_roco_datasets,
+        test_roco_datasets=test_roco_datasets,
+        evaluation_project="armor-digit",
+        experiment_name=experiment_name,
+        classes=list(ArmorDigit),
+        dataset_generator=dataset_generator,
+    )
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_pipeline_reporter_factory.py b/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_pipeline_reporter_factory.py
deleted file mode 100644
index 6c5f9a0..0000000
--- a/robots-at-robots/research/robots_at_robots/armor_digit/armor_digit_pipeline_reporter_factory.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from typing import List
-
-from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
-from research.robots_at_robots.armor_digit.armor_digit_dataset import make_armor_digit_dataset_generator
-from research.robots_at_robots.evaluation.image_pipeline_evaluation_reporter import ImagePipelineEvaluationReporter
-from research.robots_at_robots.evaluation.image_pipeline_evaluator import ImagePipelineEvaluator
-
-
-class ArmorDigitPipelineReporterFactory:
-    @staticmethod
-    def from_roco_datasets(
-        train_roco_datasets: List[ROCODatasetBuilder],
-        test_roco_datasets: List[ROCODatasetBuilder],
-        experiment_name: str,
-    ):
-        return ImagePipelineEvaluationReporter(
-            evaluator=ImagePipelineEvaluator(
-                train_roco_datasets=train_roco_datasets,
-                test_roco_datasets=test_roco_datasets,
-                image_dataset_generator=make_armor_digit_dataset_generator(),
-            ),
-            evaluation_project="armor-digit",
-            experiment_name=experiment_name,
-        )
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py b/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
index 757247e..1b48d0e 100644
--- a/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
+++ b/robots-at-robots/research/robots_at_robots/armor_digit/benchmark.py
@@ -3,8 +3,6 @@ import warnings
 from pathlib import Path
 from typing import List, Sequence, Tuple
 
-import seaborn as sns
-from cv2.cv2 import resize
 from keras_preprocessing.image import ImageDataGenerator
 from numpy import asarray
 from tensorflow_core.python.keras import Input, Model, Sequential
@@ -15,16 +13,15 @@ from tensorflow_core.python.keras.optimizer_v2.adam import Adam
 from tensorflow_core.python.keras.optimizer_v2.gradient_descent import SGD
 from tensorflow_core.python.keras.utils.np_utils import to_categorical
 
+from polystar.common.image_pipeline.preprocessors.normalise import Normalise
+from polystar.common.image_pipeline.preprocessors.resize import Resize
 from polystar.common.models.image import Image
 from polystar.common.models.object import ArmorDigit
 from polystar.common.pipeline.classification.classification_pipeline import ClassificationPipeline
 from polystar.common.pipeline.classification.classifier_abc import ClassifierABC
 from polystar.common.pipeline.classification.random_model import RandomClassifier
-from polystar.common.pipeline.pipe_abc import PipeABC
 from research.common.datasets.roco.zoo.roco_dataset_zoo import ROCODatasetsZoo
-from research.robots_at_robots.armor_digit.armor_digit_pipeline_reporter_factory import (
-    ArmorDigitPipelineReporterFactory,
-)
+from research.robots_at_robots.armor_digit.armor_digit_benchmarker import make_armor_digit_benchmarker
 
 
 class ArmorDigitPipeline(ClassificationPipeline):
@@ -45,14 +42,14 @@ class KerasClassifier(ClassifierABC):
         return ImageDataGenerator(rotation_range=45, zoom_range=[0.8, 1])  # brightness_range=[0.7, 1.4]
 
     def fit(self, images: List[Image], labels: List[int]) -> "KerasClassifier":
-        n_val: int = 540  # FIXME
+        n_val: int = 371  # FIXME
         images = asarray(images)
         labels = to_categorical(asarray(labels), 5)  # FIXME
         train_images, train_labels = images[:-n_val], labels[:-n_val]
         val_images, val_labels = images[-n_val:], labels[-n_val:]
 
         batch_size = 32  # FIXME
-        train_generator = self.train_data_gen.flow(train_images, train_labels, batch_size)
+        train_generator = self.train_data_gen.flow(train_images, train_labels, batch_size=batch_size, shuffle=True)
 
         self.model.fit(
             x=train_generator,
@@ -100,19 +97,6 @@ class CNN(KerasClassifier):
         )
 
 
-class Resize(PipeABC):
-    def __init__(self, size: Tuple[int, int]):
-        self.size = size
-
-    def transform_single(self, image: Image) -> Image:
-        return resize(image, self.size)
-
-
-class Normalise(PipeABC):
-    def transform_single(self, image: Image) -> Image:
-        return image / 255
-
-
 def make_digits_cnn_pipeline(
     input_size: int, conv_blocks: Sequence[Sequence[int]], report_dir: Path, with_data_augmentation: bool, lr: float
 ) -> ArmorDigitPipeline:
@@ -186,9 +170,7 @@ if __name__ == "__main__":
     logging.getLogger("tensorflow").setLevel("ERROR")
     warnings.filterwarnings("ignore")
 
-    sns.set_style()
-
-    reporter = ArmorDigitPipelineReporterFactory.from_roco_datasets(
+    _benchmarker = make_armor_digit_benchmarker(
         train_roco_datasets=[
             # ROCODatasetsZoo.DJI.CENTRAL_CHINA,
             # ROCODatasetsZoo.DJI.FINAL,
@@ -200,32 +182,39 @@ if __name__ == "__main__":
             ROCODatasetsZoo.TWITCH.T470152289,
         ],
         test_roco_datasets=[
-            #
             ROCODatasetsZoo.TWITCH.T470152838,
             ROCODatasetsZoo.TWITCH.T470153081,
             ROCODatasetsZoo.TWITCH.T470158483,
             ROCODatasetsZoo.TWITCH.T470152730,
         ],
-        experiment_name="data_augm",
+        experiment_name="test-benchmarker",
     )
 
     random_pipeline = ArmorDigitPipeline.from_pipes([RandomClassifier()], name="random")
 
+    report_dir = _benchmarker.reporter.report_dir
     cnn_pipelines = [
-        make_digits_cnn_pipeline(32, ((32, 32), (64, 64)), reporter.report_dir, with_data_augmentation=True, lr=lr)
-        for lr in (1e-2, 5e-3, 2e-3, 1e-3, 5e-4, 2e-4)
-    ] + [
         make_digits_cnn_pipeline(
-            64, ((32,), (64, 64), (64, 64)), reporter.report_dir, with_data_augmentation=False, lr=lr
+            32, ((32, 32), (64, 64)), report_dir, with_data_augmentation=with_data_augmentation, lr=lr,
         )
-        for lr in (5e-2, 2e-2, 1e-2, 5e-3, 2e-3, 1e-3)
+        for with_data_augmentation in [False]
+        for lr in [2.5e-2, 1.6e-2, 1e-2, 6.3e-3, 4e-4]
     ]
+    # cnn_pipelines = [
+    #     make_digits_cnn_pipeline(
+    #         64, ((32,), (64, 64), (64, 64)), reporter.report_dir, with_data_augmentation=True, lr=lr
+    #     )
+    #     for with_data_augmentation in [True, False]
+    #     for lr in (5.6e-2, 3.1e-2, 1.8e-2, 1e-2, 5.6e-3, 3.1e-3, 1.8e-3, 1e-3)
+    # ]
 
     vgg16_pipelines = [
-        make_vgg16_pipeline(reporter.report_dir, input_size=32, with_data_augmentation=True, lr=lr)
+        make_vgg16_pipeline(report_dir, input_size=32, with_data_augmentation=False, lr=lr)
         for lr in (1e-5, 5e-4, 2e-4, 1e-4, 5e-3)
     ]
 
-    logging.info(f"Run `tensorboard --logdir={reporter.report_dir}` for realtime logs")
+    logging.info(f"Run `tensorboard --logdir={report_dir}` for realtime logs")
 
-    reporter.report([random_pipeline, *cnn_pipelines, *vgg16_pipelines])
+    _benchmarker.benchmark(
+        [random_pipeline,]
+    )
diff --git a/robots-at-robots/research/robots_at_robots/armor_digit/clean_datasets.py b/robots-at-robots/research/robots_at_robots/armor_digit/clean_datasets.py
index 943d412..394a1c4 100644
--- a/robots-at-robots/research/robots_at_robots/armor_digit/clean_datasets.py
+++ b/robots-at-robots/research/robots_at_robots/armor_digit/clean_datasets.py
@@ -18,7 +18,18 @@ if __name__ == "__main__":
     _armor_digit_dataset = (
         make_armor_digit_dataset_generator()
         .from_roco_dataset(_roco_dataset)
-        .skip((1009 - 117) + (1000 - 86) + (1000 - 121) + (1000 - 138) + (1000 - 137))
+        .skip(
+            (1009 - 117)
+            + (1000 - 86)
+            + (1000 - 121)
+            + (1000 - 138)
+            + (1000 - 137)
+            + (1000 - 154)
+            + (1000 - 180)
+            + (1000 - 160)
+            + (1000 - 193)
+            + (1000 - 80)
+        )
         .cap(1000)
     )
 
diff --git a/robots-at-robots/research/robots_at_robots/dataset/armor_value_dataset_generator.py b/robots-at-robots/research/robots_at_robots/dataset/armor_value_dataset_generator.py
index 7b4ce98..4aafd34 100644
--- a/robots-at-robots/research/robots_at_robots/dataset/armor_value_dataset_generator.py
+++ b/robots-at-robots/research/robots_at_robots/dataset/armor_value_dataset_generator.py
@@ -5,6 +5,7 @@ from polystar.common.filters.exclude_filter import ExcludeFilter
 from polystar.common.filters.filter_abc import FilterABC
 from polystar.common.filters.pass_through_filter import PassThroughFilter
 from research.common.dataset.cleaning.dataset_changes import DatasetChanges
+from research.common.datasets.image_dataset import FileImageDataset
 from research.common.datasets.image_file_dataset_builder import DirectoryDatasetBuilder
 from research.common.datasets.lazy_dataset import TargetT
 from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
@@ -28,12 +29,13 @@ class ArmorValueDatasetGenerator(Generic[TargetT]):
         self.task_name = task_name
         self.targets_filter = targets_filter or PassThroughFilter()
 
-    def from_roco_datasets(self, roco_datasets: List[ROCODatasetBuilder]) -> List[DirectoryDatasetBuilder[TargetT]]:
-        return [self.from_roco_dataset(roco_dataset) for roco_dataset in roco_datasets]
+    # FIXME: inconsistent with from_roco_dataset below, which returns a builder rather than built datasets
+    def from_roco_datasets(self, roco_datasets: List[ROCODatasetBuilder]) -> List[FileImageDataset[TargetT]]:
+        return [self.from_roco_dataset(roco_dataset).to_file_images().build() for roco_dataset in roco_datasets]
 
     def from_roco_dataset(self, roco_dataset_builder: ROCODatasetBuilder) -> DirectoryDatasetBuilder[TargetT]:
         cache_dir = roco_dataset_builder.main_dir / self.task_name
-        dataset_name = f"{roco_dataset_builder.name}_armor_{self.task_name}"
+        dataset_name = roco_dataset_builder.name
 
         ArmorValueDatasetCache(roco_dataset_builder, cache_dir, dataset_name, self.target_factory).generate_if_needed()
 
diff --git a/robots-at-robots/research/robots_at_robots/demos/demo_pipeline.py b/robots-at-robots/research/robots_at_robots/demos/demo_pipeline.py
index 0212bf8..c3a4d34 100644
--- a/robots-at-robots/research/robots_at_robots/demos/demo_pipeline.py
+++ b/robots-at-robots/research/robots_at_robots/demos/demo_pipeline.py
@@ -16,7 +16,7 @@ from polystar.common.utils.tensorflow import patch_tf_v2
 from polystar.common.view.plt_results_viewer import PltResultViewer
 from polystar.robots_at_robots.dependency_injection import make_injector
 from research.common.datasets.roco.zoo.roco_dataset_zoo import ROCODatasetsZoo
-from research.robots_at_robots.armor_color.baseline_experiments import (
+from research.robots_at_robots.armor_color.benchmark import (
     ArmorColorPipeline,
     MeanChannels,
     RedBlueComparisonClassifier,
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py b/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py
new file mode 100644
index 0000000..045b13d
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/benchmark.py
@@ -0,0 +1,49 @@
+from dataclasses import dataclass
+from typing import List
+
+from polystar.common.pipeline.classification.classification_pipeline import ClassificationPipeline
+from research.common.datasets.image_dataset import FileImageDataset
+from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
+from research.robots_at_robots.dataset.armor_value_dataset_generator import ArmorValueDatasetGenerator
+from research.robots_at_robots.evaluation.image_pipeline_evaluation_reporter import ImagePipelineEvaluationReporter
+from research.robots_at_robots.evaluation.image_pipeline_evaluator import ImageClassificationPipelineEvaluator
+from research.robots_at_robots.evaluation.metrics.f1 import F1Metric
+from research.robots_at_robots.evaluation.trainer import ImageClassificationPipelineTrainer
+
+
+@dataclass
+class Benchmarker:
+    def __init__(
+        self,
+        train_datasets: List[FileImageDataset],
+        test_datasets: List[FileImageDataset],
+        evaluation_project: str,
+        experiment_name: str,
+        classes: List,
+    ):
+        self.trainer = ImageClassificationPipelineTrainer(train_datasets)
+        self.evaluator = ImageClassificationPipelineEvaluator(train_datasets, test_datasets)
+        self.reporter = ImagePipelineEvaluationReporter(
+            evaluation_project, experiment_name, classes, other_metrics=[F1Metric()]
+        )
+
+    def benchmark(self, pipelines: List[ClassificationPipeline]):
+        self.trainer.train_pipelines(pipelines)
+        self.reporter.report(self.evaluator.evaluate_pipelines(pipelines))
+
+
+def make_armor_value_benchmarker(
+    train_roco_datasets: List[ROCODatasetBuilder],
+    test_roco_datasets: List[ROCODatasetBuilder],
+    evaluation_project: str,
+    experiment_name: str,
+    dataset_generator: ArmorValueDatasetGenerator,
+    classes: List,
+):
+    return Benchmarker(
+        dataset_generator.from_roco_datasets(train_roco_datasets),
+        dataset_generator.from_roco_datasets(test_roco_datasets),
+        evaluation_project=evaluation_project,
+        experiment_name=experiment_name,
+        classes=classes,
+    )
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
index 6cd66e5..72996a9 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluation_reporter.py
@@ -1,10 +1,8 @@
 from collections import Counter
-from dataclasses import dataclass, field
-from enum import Enum
+from dataclasses import InitVar, dataclass, field
 from math import log
 from os.path import relpath
-from pathlib import Path
-from typing import Dict, Generic, Iterable, List, Optional, Tuple
+from typing import Generic, List, Optional, Tuple
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -12,189 +10,233 @@ import seaborn as sns
 from matplotlib.axes import Axes, logging
 from matplotlib.figure import Figure
 from pandas import DataFrame
+from sklearn.metrics import classification_report, confusion_matrix
 
 from polystar.common.pipeline.classification.classification_pipeline import EnumT
-from polystar.common.pipeline.pipeline import Pipeline
 from polystar.common.utils.dataframe import Format, format_df_row, format_df_rows, make_formater
 from polystar.common.utils.markdown import MarkdownFile
 from polystar.common.utils.time import create_time_id
 from research.common.constants import DSET_DIR, EVALUATION_DIR
-from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
-from research.robots_at_robots.evaluation.image_pipeline_evaluator import (
-    ClassificationResults,
-    ImagePipelineEvaluator,
-    SetClassificationResults,
-)
-
-
-class Metric(Enum):
-    F1_WEIGHTED_AVG = ("f1-score", "weighted avg")
-    ACCURACY = ("precision", "accuracy")
-
-    def __str__(self):
-        if self.value[1] == "accuracy":
-            return "accuracy"
-        return " ".join(self.value)
-
-    def __getitem__(self, item):
-        return self.value[item]
+from research.robots_at_robots.evaluation.metrics.accuracy import AccuracyMetric
+from research.robots_at_robots.evaluation.metrics.metric_abc import MetricABC
+from research.robots_at_robots.evaluation.performance import ClassificationPerformance, ClassificationPerformances
+from research.robots_at_robots.evaluation.set import Set
 
 
 @dataclass
 class ImagePipelineEvaluationReporter(Generic[EnumT]):
-    evaluator: ImagePipelineEvaluator[EnumT]
     evaluation_project: str
     experiment_name: str
-    main_metric: Metric = Metric.F1_WEIGHTED_AVG
-    other_metrics: List[Metric] = field(default_factory=lambda: [Metric.ACCURACY])
+    classes: List[EnumT]
+    main_metric: MetricABC = field(default_factory=AccuracyMetric)
+    other_metrics: InitVar[List[MetricABC]] = None
+    _mf: MarkdownFile = field(init=False)
+    _performances: ClassificationPerformances = field(init=False)
 
-    def __post_init__(self):
+    def __post_init__(self, other_metrics: List[MetricABC]):
         self.report_dir = EVALUATION_DIR / self.evaluation_project / f"{create_time_id()}_{self.experiment_name}"
+        self.all_metrics: List[MetricABC] = [self.main_metric] + (other_metrics or [])
 
-    def report(self, pipelines: Iterable[Pipeline]):
-        logging.info(f"Running experiment {self.experiment_name}")
-
-        pipeline2results = self.evaluator.evaluate_pipelines(pipelines)
+    def report(self, performances: ClassificationPerformances):
+        sns.set()
+        self._performances = performances
+        with MarkdownFile(self.report_dir / "report.md") as self._mf:
 
-        with MarkdownFile(self.report_dir / "report.md") as mf:
-            mf.title(f"Evaluation report")
-            self._report_datasets(mf)
-            self._report_aggregated_results(mf, pipeline2results, self.report_dir)
-            self._report_pipelines_results(mf, pipeline2results)
+            self._mf.title(f"Evaluation report")
+            self._report_datasets()
+            self._report_aggregated_results()
+            self._report_pipelines_results()
 
             logging.info(f"Report generated at file:///{self.report_dir/'report.md'}")
 
-    def _report_datasets(self, mf: MarkdownFile):
-        mf.title("Datasets", level=2)
+    def _report_datasets(self):
+        self._mf.title("Datasets", level=2)
 
-        mf.title("Training", level=3)
-        self._report_dataset(
-            mf, self.evaluator.train_roco_datasets, self.evaluator.train_dataset_sizes, self.evaluator.train_labels
-        )
+        self._mf.title("Training", level=3)
+        self._report_dataset(self._performances.train)
 
-        mf.title("Testing", level=3)
-        self._report_dataset(
-            mf, self.evaluator.test_roco_datasets, self.evaluator.test_dataset_sizes, self.evaluator.test_labels
-        )
+        self._mf.title("Testing", level=3)
+        self._report_dataset(self._performances.test)
 
-    @staticmethod
-    def _report_dataset(
-        mf: MarkdownFile, roco_datasets: List[ROCODatasetBuilder], dataset_sizes: List[int], labels: List[EnumT]
-    ):
-        total = len(labels)
-        labels = [str(label) for label in labels]
-        mf.paragraph(f"{total} images")
+    def _report_dataset(self, performances: ClassificationPerformances):
         df = (
-            DataFrame(
-                {
-                    dataset.name: Counter(labels[start:end])
-                    for dataset, start, end in zip(
-                        roco_datasets, np.cumsum([0] + dataset_sizes), np.cumsum(dataset_sizes)
-                    )
-                }
-            )
+            DataFrame({perf.dataset_name: Counter(perf.labels) for perf in performances})
             .fillna(0)
             .sort_index()
+            .astype(int)
         )
-        df["Total"] = sum([df[d.name] for d in roco_datasets])
-        df["Repartition"] = (df["Total"] / total).map("{:.1%}".format)
-        mf.table(df)
-
-    def _report_aggregated_results(
-        self, mf: MarkdownFile, pipeline2results: Dict[str, ClassificationResults[EnumT]], report_dir: Path
-    ):
-        fig_scores, fig_times, aggregated_results = self._aggregate_results(pipeline2results)
-        aggregated_scores_image_name = "aggregated_scores.png"
-        fig_scores.savefig(report_dir / aggregated_scores_image_name)
-        aggregated_times_image_name = "aggregated_times.png"
-        fig_times.savefig(report_dir / aggregated_times_image_name)
-
-        mf.title("Aggregated results", level=2)
-        mf.image(aggregated_scores_image_name)
-        mf.image(aggregated_times_image_name)
-        mf.paragraph("On test set:")
-        mf.table(aggregated_results[aggregated_results["set"] == "test"].drop(columns="set"))
-        mf.paragraph("On train set:")
-        mf.table(aggregated_results[aggregated_results["set"] == "train"].drop(columns="set"))
-
-    def _report_pipelines_results(self, mf: MarkdownFile, pipeline2results: Dict[str, ClassificationResults[EnumT]]):
-        for pipeline_name, results in sorted(
-            pipeline2results.items(),
-            key=lambda name_results: name_results[1].test_results.report[self.main_metric[1]][self.main_metric[0]],
+        df["Total"] = df.sum(axis=1)
+        df["Repartition"] = df["Total"] / df["Total"].sum()
+        df.loc["Total"] = df.sum()
+        df.loc["Repartition"] = df.loc["Total"] / df["Total"]["Total"]
+        dset_repartition = df.loc["Repartition"].map("{:.1%}".format)
+        df["Repartition"] = df["Repartition"].map("{:.1%}".format)
+        df.loc["Repartition"] = dset_repartition
+        df.at["Total", "Repartition"] = ""
+        df.at["Repartition", "Repartition"] = ""
+        df.at["Repartition", "Total"] = ""
+        self._mf.table(df)
+
+    def _report_aggregated_results(self):
+        fig_scores, fig_times = self._make_aggregate_figures()
+
+        self._mf.title("Aggregated results", level=2)
+        self._mf.figure(fig_scores, "aggregated_scores.png")
+        self._mf.figure(fig_times, "aggregated_times.png")
+
+        self._mf.paragraph("On test set:")
+        self._mf.table(self._make_aggregated_results_for_set(Set.TEST))
+        self._mf.paragraph("On train set:")
+        self._mf.table(self._make_aggregated_results_for_set(Set.TRAIN))
+
+    def _report_pipelines_results(self):
+        for pipeline_name, performances in sorted(
+            self._performances.group_by_pipeline().items(),
+            key=lambda name_perfs: self.main_metric(name_perfs[1].test.merge()),
             reverse=True,
         ):
-            self._report_pipeline_results(mf, pipeline_name, results)
+            self._report_pipeline_results(pipeline_name, performances)
 
-    def _report_pipeline_results(self, mf: MarkdownFile, pipeline_name: str, results: ClassificationResults[EnumT]):
-        mf.title(pipeline_name, level=2)
+    def _report_pipeline_results(self, pipeline_name: str, performances: ClassificationPerformances):
+        self._mf.title(pipeline_name, level=2)
 
-        mf.paragraph(results.full_pipeline_name)
+        self._mf.title("Train results", level=3)
+        self._report_pipeline_set_results(performances, Set.TRAIN)
 
-        mf.title("Train results", level=3)
-        ImagePipelineEvaluationReporter._report_pipeline_set_results(
-            mf, results.train_results, self.evaluator.train_images_paths
-        )
+        self._mf.title("Test results", level=3)
+        self._report_pipeline_set_results(performances, Set.TEST)
 
-        mf.title("Test results", level=3)
-        ImagePipelineEvaluationReporter._report_pipeline_set_results(
-            mf, results.test_results, self.evaluator.test_images_paths
-        )
+    def _report_pipeline_set_results(self, performances: ClassificationPerformances, set_: Set):
+        performances = performances.on_set(set_)
+        perf = performances.merge()
+
+        self._mf.title("Metrics", level=4)
+        self._report_pipeline_set_metrics(performances, perf, set_)
 
-    @staticmethod
-    def _report_pipeline_set_results(
-        mf: MarkdownFile, results: SetClassificationResults[EnumT], image_paths: List[Path]
+        self._mf.title("Confusion Matrix:", level=4)
+        self._report_pipeline_set_confusion_matrix(perf)
+
+        self._mf.title("25 Mistakes examples", level=4)
+        self._report_pipeline_set_mistakes(perf)
+
+    def _report_pipeline_set_metrics(
+        self, performances: ClassificationPerformances, perf: ClassificationPerformance, set_: Set
     ):
-        mf.title("Metrics", level=4)
-        mf.paragraph(f"Inference time: {results.mean_inference_time: .2e} s/img")
-        df = DataFrame(results.report)
+        fig: Figure = plt.figure(figsize=(9, 6))
+        ax: Axes = fig.subplots()
+        sns.barplot(
+            data=DataFrame(
+                [
+                    {"dataset": performance.dataset_name, "score": metric(performance), "metric": metric.name}
+                    for performance in performances
+                    for metric in self.all_metrics
+                ]
+                + [
+                    {"dataset": performance.dataset_name, "score": len(performance) / len(perf), "metric": "support"}
+                    for performance in performances
+                ]
+            ),
+            x="dataset",
+            hue="metric",
+            y="score",
+            ax=ax,
+        )
+        ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha="right")
+        pipeline_name = performances.performances[0].pipeline_name
+        fig.suptitle(f"{pipeline_name} performance across {set_} datasets")
+        _format_ax(ax, "{:.1%}", limits=(0, 1))
+        fig.tight_layout()
+        self._mf.figure(fig, f"{pipeline_name}_{set_}.png")
+
+        self._mf.paragraph(f"Inference time: {perf.mean_inference_time: .2e} s/img")
+        df = DataFrame(classification_report(perf.labels, perf.predictions, output_dict=True))
         format_df_rows(df, ["precision", "recall", "f1-score"], "{:.1%}")
         format_df_row(df, "support", int)
-        mf.table(df)
-        mf.title("Confusion Matrix:", level=4)
-        mf.table(DataFrame(results.confusion_matrix, index=results.unique_labels, columns=results.unique_labels))
-        mf.title("25 Mistakes examples", level=4)
-        mistakes_idx = np.random.choice(results.mistakes, min(len(results.mistakes), 25), replace=False)
+        self._mf.table(df)
+
+    def _report_pipeline_set_confusion_matrix(self, perf: ClassificationPerformance):
+        self._mf.table(
+            DataFrame(
+                confusion_matrix(perf.labels, perf.predictions), index=perf.unique_labels, columns=perf.unique_labels
+            )
+        )
+
+    def _report_pipeline_set_mistakes(self, perf: ClassificationPerformance):
+        mistakes = perf.mistakes
+        mistakes_idx = np.random.choice(mistakes, min(len(mistakes), 25), replace=False)
         relative_paths = [
-            f"![img]({relpath(str(image_paths[idx]), str(mf.markdown_path.parent))})" for idx in mistakes_idx
+            f"![img]({relpath(str(perf.examples[idx].path), str(self._mf.markdown_path.parent))})"
+            for idx in mistakes_idx
+        ]
+        images_names = [
+            f"[{perf.examples[idx].path.relative_to(DSET_DIR)}]"
+            f"({relpath(str(perf.examples[idx].path), str(self._mf.markdown_path.parent))})"
+            for idx in mistakes_idx
         ]
-        images_names = [image_paths[idx].relative_to(DSET_DIR) for idx in mistakes_idx]
-        mf.table(
+        self._mf.table(
             DataFrame(
                 {
                     "images": relative_paths,
-                    "labels": map(str, results.labels[mistakes_idx]),
-                    "predictions": map(str, results.predictions[mistakes_idx]),
+                    "labels": perf.labels[mistakes_idx],
+                    "predictions": perf.predictions[mistakes_idx],
+                    **{
+                        f"p({str(label)})": map("{:.1%}".format, perf.proba[mistakes_idx, i])
+                        for i, label in enumerate(self.classes)
+                    },
                     "image names": images_names,
                 }
             ).set_index("images")
         )
 
-    def _aggregate_results(
-        self, pipeline2results: Dict[str, ClassificationResults[EnumT]]
-    ) -> Tuple[Figure, Figure, DataFrame]:
-        sns.set_style()
-        sets = ["train", "test"]
+    def _make_aggregate_figures(self) -> Tuple[Figure, Figure]:
         df = DataFrame.from_records(
             [
                 {
-                    "pipeline": pipeline_name,
-                    str(self.main_metric): results.on_set(set_).report[self.main_metric[1]][self.main_metric[0]],
-                    "inference time": results.on_set(set_).mean_inference_time,
-                    "set": set_,
+                    "dataset": perf.dataset_name,
+                    "pipeline": perf.pipeline_name,
+                    self.main_metric.name: self.main_metric(perf),
+                    "time": perf.mean_inference_time,
+                    "set": perf.set_.name.lower(),
+                    "support": len(perf),
                 }
-                for pipeline_name, results in pipeline2results.items()
-                # for metric in [self.main_metric]  # + self.other_metrics
-                for set_ in sets
+                for perf in self._performances
             ]
-        ).sort_values(["set", str(self.main_metric)], ascending=[True, False])
+        ).sort_values(["set", self.main_metric.name], ascending=[True, False])
 
+        df[f"{self.main_metric.name} "] = list(zip(df[self.main_metric.name], df.support))  # (score, support) pairs for weighted_mean
+        df["time "] = list(zip(df["time"], df.support))
+
+        return (
+            _cat_pipeline_results(df, f"{self.main_metric.name} ", "{:.1%}", limits=(0, 1)),
+            _cat_pipeline_results(df, "time ", "{:.2e}", log_scale=True),
+        )
+
+    def _make_aggregated_results_for_set(self, set_: Set) -> DataFrame:
+        pipeline2performances = self._performances.on_set(set_).group_by_pipeline()
+        pipeline2performance = {
+            pipeline_name: performances.merge() for pipeline_name, performances in pipeline2performances.items()
+        }
         return (
-            _cat_pipeline_results(df, str(self.main_metric), "{:.1%}", limits=(0, 1)),
-            _cat_pipeline_results(df, "inference time", "{:.2e}", log_scale=True),
-            df.set_index("pipeline"),
+            DataFrame(
+                [
+                    {
+                        "pipeline": pipeline_name,
+                        self.main_metric.name: self.main_metric(performance),
+                        "inference time": performance.mean_inference_time,
+                    }
+                    for pipeline_name, performance in pipeline2performance.items()
+                ]
+            )
+            .set_index("pipeline")
+            .sort_values(self.main_metric.name, ascending=False)
         )
 
 
+def weighted_mean(x, **kws):
+    val, weight = map(np.asarray, zip(*x))
+    return (val * weight).sum() / weight.sum()
+
+
 def _cat_pipeline_results(
     df: DataFrame, y: str, fmt: str, limits: Optional[Tuple[float, float]] = None, log_scale: bool = False
 ) -> Figure:
@@ -208,6 +250,8 @@ def _cat_pipeline_results(
         legend=False,
         col_order=["test", "train"],
         height=10,
+        estimator=weighted_mean,
+        orient="v",
     )
     grid.set_xticklabels(rotation=30, ha="right")
 
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
index 2a23706..266de79 100644
--- a/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
+++ b/robots-at-robots/research/robots_at_robots/evaluation/image_pipeline_evaluator.py
@@ -1,111 +1,57 @@
-import logging
-from dataclasses import dataclass
 from enum import Enum
-from pathlib import Path
+from itertools import chain
 from time import time
-from typing import Dict, Generic, Iterable, List, Sequence, Tuple
+from typing import Generic, Iterable, List
 
 import numpy as np
-from memoized_property import memoized_property
-from sklearn.metrics import classification_report, confusion_matrix
-from tqdm import tqdm
 
-from polystar.common.models.image import Image, load_images
-from polystar.common.pipeline.pipeline import Pipeline
+from polystar.common.models.image import file_images_to_images
+from polystar.common.pipeline.classification.classification_pipeline import ClassificationPipeline
+from polystar.common.utils.iterable_utils import flatten
+from research.common.datasets.image_dataset import FileImageDataset
 from research.common.datasets.lazy_dataset import TargetT
-from research.common.datasets.roco.roco_dataset_builder import ROCODatasetBuilder
-from research.common.datasets.union_dataset import UnionDataset
-from research.robots_at_robots.dataset.armor_value_dataset_generator import ArmorValueDatasetGenerator
+from research.robots_at_robots.evaluation.performance import (
+    ClassificationPerformance,
+    ClassificationPerformances,
+    ContextualizedClassificationPerformance,
+)
+from research.robots_at_robots.evaluation.set import Set
 
 
-@dataclass
-class SetClassificationResults(Generic[TargetT]):
-    labels: np.ndarray
-    predictions: np.ndarray
-    mean_inference_time: float
-
-    @property
-    def report(self) -> Dict:
-        return classification_report(self.labels, self.predictions, output_dict=True)
-
-    @property
-    def confusion_matrix(self) -> Dict:
-        return confusion_matrix(self.labels, self.predictions)
-
-    @property
-    def mistakes(self) -> Sequence[int]:
-        return np.where(self.labels != self.predictions)[0]
-
-    @memoized_property
-    def unique_labels(self) -> List[TargetT]:
-        return sorted(set(self.labels) | set(self.predictions))
-
-
-@dataclass
-class ClassificationResults(Generic[TargetT]):
-    train_results: SetClassificationResults[TargetT]
-    test_results: SetClassificationResults[TargetT]
-    full_pipeline_name: str
-
-    def on_set(self, set_: str) -> SetClassificationResults[TargetT]:
-        if set_ is "train":
-            return self.train_results
-        return self.test_results
-
-
-class ImagePipelineEvaluator(Generic[TargetT]):
+class ImageClassificationPipelineEvaluator(Generic[TargetT]):
     def __init__(
-        self,
-        train_roco_datasets: List[ROCODatasetBuilder],
-        test_roco_datasets: List[ROCODatasetBuilder],
-        image_dataset_generator: ArmorValueDatasetGenerator[TargetT],
+        self, train_datasets: List[FileImageDataset], test_datasets: List[FileImageDataset],
     ):
-        logging.info("Loading data")
-        self.train_roco_datasets = train_roco_datasets
-        self.test_roco_datasets = test_roco_datasets
-        (self.train_images_paths, self.train_images, self.train_labels, self.train_dataset_sizes) = load_datasets(
-            train_roco_datasets, image_dataset_generator
-        )
-        (self.test_images_paths, self.test_images, self.test_labels, self.test_dataset_sizes) = load_datasets(
-            test_roco_datasets, image_dataset_generator
-        )
-
-    def evaluate_pipelines(self, pipelines: Iterable[Pipeline]) -> Dict[str, ClassificationResults]:
-        tqdm_pipelines = tqdm(pipelines, desc="Training", unit="pipeline")
-        return {str(pipeline): self.evaluate_pipeline(pipeline, tqdm_pipelines) for pipeline in tqdm_pipelines}
+        self.train_datasets = train_datasets
+        self.test_datasets = test_datasets
 
-    def evaluate_pipeline(self, pipeline: Pipeline, tqdm_pipelines: tqdm) -> ClassificationResults:
-        tqdm_pipelines.set_postfix({"pipeline": pipeline.name}, True)
-        pipeline.fit(self.train_images, self.train_labels)
+    def evaluate_pipelines(self, pipelines: Iterable[ClassificationPipeline]) -> ClassificationPerformances:
+        return ClassificationPerformances(flatten(self._evaluate_pipeline(pipeline) for pipeline in pipelines))
 
-        train_results = self._evaluate_pipeline_on_set(pipeline, self.train_images, self.train_labels)
-        test_results = self._evaluate_pipeline_on_set(pipeline, self.test_images, self.test_labels)
-
-        return ClassificationResults(
-            train_results=train_results, test_results=test_results, full_pipeline_name=repr(pipeline),
+    def _evaluate_pipeline(self, pipeline: ClassificationPipeline) -> Iterable[ContextualizedClassificationPerformance]:
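+        # Evaluate on every train and test dataset, tagging each result with its split.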
+        return chain(
+            self._evaluate_pipeline_on_set(pipeline, self.train_datasets, Set.TRAIN),
+            self._evaluate_pipeline_on_set(pipeline, self.test_datasets, Set.TEST),
         )
 
     @staticmethod
     def _evaluate_pipeline_on_set(
-        pipeline: Pipeline, images: List[Image], labels: List[TargetT]
-    ) -> SetClassificationResults:
-        t = time()
-        preds = pipeline.predict(images)
-        mean_time = (time() - t) / len(images)
-        return SetClassificationResults(_labels_to_numpy(labels), _labels_to_numpy(preds), mean_time)
-
-
-def load_datasets(
-    roco_datasets: List[ROCODatasetBuilder], image_dataset_generator: ArmorValueDatasetGenerator[TargetT],
-) -> Tuple[List[Path], List[Image], List[TargetT], List[int]]:
-    # TODO we should receive a list of FileImageDataset
-    datasets = [builder.build() for builder in image_dataset_generator.from_roco_datasets(roco_datasets)]
-    dataset_sizes = [len(d) for d in datasets]
-
-    dataset = UnionDataset(datasets)
-    paths, targets = list(dataset.examples), list(dataset.targets)
-    images = list(load_images(paths))
-    return paths, images, targets, dataset_sizes
+        pipeline: ClassificationPipeline, datasets: List[FileImageDataset], set_: Set
+    ) -> Iterable[ContextualizedClassificationPerformance]:
+        for dataset in datasets:
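+            # Load the images up front so that only the prediction call is timed.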
+            images = file_images_to_images(dataset.examples)
+            t = time()
+            proba, classes = pipeline.predict_proba_and_classes(images)
+            mean_time = (time() - t) / len(dataset)
+            yield ContextualizedClassificationPerformance(
+                examples=dataset.examples,
+                labels=_labels_to_numpy(dataset.targets),
+                predictions=_labels_to_numpy(classes),
+                proba=proba,
+                mean_inference_time=mean_time,
+                set_=set_,
+                dataset_name=dataset.name,
+                pipeline_name=pipeline.name,
+            )
 
 
 def _labels_to_numpy(labels: List[Enum]) -> np.ndarray:
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/metrics/__init__.py b/robots-at-robots/research/robots_at_robots/evaluation/metrics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/metrics/accuracy.py b/robots-at-robots/research/robots_at_robots/evaluation/metrics/accuracy.py
new file mode 100644
index 0000000..ccfe9c7
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/metrics/accuracy.py
@@ -0,0 +1,11 @@
+from research.robots_at_robots.evaluation.metrics.metric_abc import MetricABC
+from research.robots_at_robots.evaluation.performance import ClassificationPerformance
+
+
+class AccuracyMetric(MetricABC):
+    def __call__(self, performance: ClassificationPerformance) -> float:
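+        # Fraction of examples whose predicted class matches the label.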
+        return (performance.labels == performance.predictions).mean()
+
+    @property
+    def name(self) -> str:
+        return "accuracy"
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/metrics/f1.py b/robots-at-robots/research/robots_at_robots/evaluation/metrics/f1.py
new file mode 100644
index 0000000..dd5f48a
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/metrics/f1.py
@@ -0,0 +1,30 @@
+from enum import Enum, auto
+
+from sklearn.metrics import f1_score
+
+from research.robots_at_robots.evaluation.metrics.metric_abc import MetricABC
+from research.robots_at_robots.evaluation.performance import ClassificationPerformance
+
+
+class F1Strategy(Enum):
+    MICRO = auto()
+    MACRO = auto()
+    SAMPLES = auto()
+    WEIGHTED = auto()
+
+    def __repr__(self):
+        return self.name.lower()
+
+    __str__ = __repr__
+
+
+class F1Metric(MetricABC):
+    def __init__(self, strategy: F1Strategy = F1Strategy.MACRO):
+        self.strategy = strategy
+
+    def __call__(self, performance: ClassificationPerformance) -> float:
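+        # Delegate to sklearn's f1_score; the strategy name maps to its `average` parameter.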
+        return f1_score(performance.labels, performance.predictions, average=str(self.strategy))
+
+    @property
+    def name(self) -> str:
+        return f"f1 {self.strategy}"
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/metrics/metric_abc.py b/robots-at-robots/research/robots_at_robots/evaluation/metrics/metric_abc.py
new file mode 100644
index 0000000..f25a0c3
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/metrics/metric_abc.py
@@ -0,0 +1,17 @@
+from abc import ABC, abstractmethod
+
+from research.robots_at_robots.evaluation.performance import ClassificationPerformance
+
+
+class MetricABC(ABC):
+    @abstractmethod
+    def __call__(self, performance: ClassificationPerformance) -> float:
+        pass
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        pass
+
+    def __repr__(self):
+        return self.name
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/performance.py b/robots-at-robots/research/robots_at_robots/evaluation/performance.py
new file mode 100644
index 0000000..33c0bc7
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/performance.py
@@ -0,0 +1,79 @@
+from dataclasses import dataclass
+from typing import Dict, Iterable, List, Sequence
+
+import numpy as np
+from memoized_property import memoized_property
+
+from polystar.common.filters.filter_abc import FilterABC
+from polystar.common.models.image import FileImage
+from polystar.common.utils.iterable_utils import flatten, group_by
+from research.robots_at_robots.evaluation.set import Set
+
+
+@dataclass
+class ClassificationPerformance:
+    examples: List[FileImage]
+    labels: np.ndarray
+    predictions: np.ndarray
+    proba: np.ndarray
+    mean_inference_time: float
+
+    @property
+    def mistakes(self) -> Sequence[int]:
+        return np.where(self.labels != self.predictions)[0]
+
+    @memoized_property
+    def unique_labels(self):
+        return sorted(set(self.labels) | set(self.predictions))
+
+    def __len__(self) -> int:
+        return len(self.labels)
+
+
+@dataclass
+class ContextualizedClassificationPerformance(ClassificationPerformance):
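+    # A performance enriched with the split, dataset and pipeline that produced it.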
+    set_: Set
+    dataset_name: str
+    pipeline_name: str
+
+
+@dataclass
+class ClassificationPerformances(Iterable[ContextualizedClassificationPerformance]):
+    performances: List[ContextualizedClassificationPerformance]
+
+    @property
+    def train(self) -> "ClassificationPerformances":
+        return self.on_set(Set.TRAIN)
+
+    @property
+    def test(self) -> "ClassificationPerformances":
+        return self.on_set(Set.TEST)
+
+    def on_set(self, set_: Set) -> "ClassificationPerformances":
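+        # Keep only the performances recorded on the requested split.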
+        return ClassificationPerformances(SetClassificationPerformanceFilter(set_).filter(self.performances))
+
+    def group_by_pipeline(self) -> Dict[str, "ClassificationPerformances"]:
+        return {
+            name: ClassificationPerformances(performances)
+            for name, performances in group_by(self, lambda p: p.pipeline_name).items()
+        }
+
+    def merge(self) -> ClassificationPerformance:
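+        # Concatenate the per-dataset arrays; the mean inference time is weighted by dataset size.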
+        return ClassificationPerformance(
+            examples=flatten(p.examples for p in self),
+            labels=np.concatenate([p.labels for p in self]),
+            predictions=np.concatenate([p.predictions for p in self]),
+            proba=np.concatenate([p.proba for p in self]),
+            mean_inference_time=np.average([p.mean_inference_time for p in self], weights=[len(p) for p in self]),
+        )
+
+    def __iter__(self):
+        return iter(self.performances)
+
+
+@dataclass
+class SetClassificationPerformanceFilter(FilterABC[ContextualizedClassificationPerformance]):
+    set_: Set
+
+    def validate_single(self, perf: ContextualizedClassificationPerformance) -> bool:
+        return perf.set_ is self.set_
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/set.py b/robots-at-robots/research/robots_at_robots/evaluation/set.py
new file mode 100644
index 0000000..6175a68
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/set.py
@@ -0,0 +1,14 @@
+from enum import Enum, auto
+
+
+class Set(Enum):
+    TRAIN = auto()
+    VALIDATION = auto()
+    TEST = auto()
+
+    def __repr__(self):
+        return self.name.lower()
+
+    __str__ = __repr__
diff --git a/robots-at-robots/research/robots_at_robots/evaluation/trainer.py b/robots-at-robots/research/robots_at_robots/evaluation/trainer.py
new file mode 100644
index 0000000..6731cd0
--- /dev/null
+++ b/robots-at-robots/research/robots_at_robots/evaluation/trainer.py
@@ -0,0 +1,25 @@
+from typing import Generic, List
+
+from tqdm import tqdm
+
+from polystar.common.models.image import file_images_to_images
+from polystar.common.pipeline.classification.classification_pipeline import ClassificationPipeline
+from research.common.datasets.image_dataset import FileImageDataset
+from research.common.datasets.lazy_dataset import TargetT
+from research.common.datasets.union_dataset import UnionDataset
+
+
+class ImageClassificationPipelineTrainer(Generic[TargetT]):
+    def __init__(self, training_datasets: List[FileImageDataset]):
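+        # Pool all the training datasets into a single set of images and labels.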
+        train_dataset = UnionDataset(training_datasets)
+        self.images = file_images_to_images(train_dataset.examples)
+        self.labels = train_dataset.targets
+
+    def train_pipeline(self, pipeline: ClassificationPipeline):
+        pipeline.fit(self.images, self.labels)
+
+    def train_pipelines(self, pipelines: List[ClassificationPipeline]):
+        tqdm_pipelines = tqdm(pipelines, desc="Training Pipelines")
+        for pipeline in tqdm_pipelines:
+            tqdm_pipelines.set_postfix({"pipeline": pipeline.name}, refresh=True)
+            self.train_pipeline(pipeline)
-- 
GitLab