diff --git a/common/research_common/image_pipeline_evaluation/image_pipeline_evaluation_reporter.py b/common/research_common/image_pipeline_evaluation/image_pipeline_evaluation_reporter.py
index f2160b5f55e6cacbd108eb6dd65ea0b4a94dd6ea..5879dedc199b08926ff43352e050be4df42b388d 100644
--- a/common/research_common/image_pipeline_evaluation/image_pipeline_evaluation_reporter.py
+++ b/common/research_common/image_pipeline_evaluation/image_pipeline_evaluation_reporter.py
@@ -86,19 +86,25 @@ class ImagePipelineEvaluationReporter:
         mf.title("Train results", level=3)
         mf.paragraph(f"Inference time: {results.train_mean_inference_time: .2e} s/img")
-        ImagePipelineEvaluationReporter._report_pipeline_set_report(mf, results.train_report)
+        ImagePipelineEvaluationReporter._report_pipeline_set_report(
+            mf, results.train_report, results.train_confusion_matrix
+        )
 
         mf.title("Test results", level=3)
         mf.paragraph(f"Inference time: {results.test_mean_inference_time: .2e} s/img")
-        ImagePipelineEvaluationReporter._report_pipeline_set_report(mf, results.test_report)
+        ImagePipelineEvaluationReporter._report_pipeline_set_report(
+            mf, results.test_report, results.test_confusion_matrix
+        )
 
     @staticmethod
-    def _report_pipeline_set_report(mf: MarkdownFile, set_report: Dict):
+    def _report_pipeline_set_report(mf: MarkdownFile, set_report: Dict, confusion_matrix: Dict):
         df = DataFrame(set_report)
         format_df_rows(df, ["precision", "recall", "f1-score"], "{:.1%}")
         format_df_row(df, "support", int)
         mf.table(df)
+        mf.paragraph("Confusion Matrix:")
+        mf.table(DataFrame(confusion_matrix))
 
     def _aggregate_results(self, pipeline2results: Dict[str, ClassificationResults]) -> DataFrame:
         main_metric_name = f"{self.main_metric[0]} {self.main_metric[1]}"
diff --git a/common/research_common/image_pipeline_evaluation/image_pipeline_evaluator.py b/common/research_common/image_pipeline_evaluation/image_pipeline_evaluator.py
index 993aa88d3dbd6a3dc64c75ca354c84e30641dc06..c8fe4c7feea2c9d0b57d125a8d0a3c8fd1fb384e 100644
--- a/common/research_common/image_pipeline_evaluation/image_pipeline_evaluator.py
+++ b/common/research_common/image_pipeline_evaluation/image_pipeline_evaluator.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from time import time
 from typing import List, Tuple, Dict, Any, Iterable
 
-from sklearn.metrics import classification_report
+from sklearn.metrics import classification_report, confusion_matrix
 
 from polystar.common.image_pipeline.image_pipeline import ImagePipeline
 from polystar.common.models.image import Image
@@ -14,8 +14,10 @@ from research_common.image_pipeline_evaluation.image_dataset_generator import Im
 @dataclass
 class ClassificationResults:
     train_report: Dict
+    train_confusion_matrix: Dict
     train_mean_inference_time: float
     test_report: Dict
+    test_confusion_matrix: Dict
     test_mean_inference_time: float
     full_pipeline_name: str
 
@@ -45,20 +47,26 @@ class ImagePipelineEvaluator:
         pipeline.fit(self.train_images, self.train_labels)
 
         logging.info(f"Infering")
-        train_report, train_time = self._evaluate_on_set(pipeline, self.train_images, self.train_labels)
-        test_report, test_time = self._evaluate_on_set(pipeline, self.test_images, self.test_labels)
+        train_report, train_confusion_matrix, train_time = self._evaluate_on_set(
+            pipeline, self.train_images, self.train_labels
+        )
+        test_report, test_confusion_matrix, test_time = self._evaluate_on_set(
+            pipeline, self.test_images, self.test_labels
+        )
 
         return ClassificationResults(
             train_report=train_report,
             test_report=test_report,
             train_mean_inference_time=train_time,
             test_mean_inference_time=test_time,
+            train_confusion_matrix=train_confusion_matrix,
+            test_confusion_matrix=test_confusion_matrix,
             full_pipeline_name=repr(pipeline),
         )
 
     @staticmethod
-    def _evaluate_on_set(pipeline: ImagePipeline, images: List[Image], labels: List[Any]) -> Tuple[Dict, float]:
+    def _evaluate_on_set(pipeline: ImagePipeline, images: List[Image], labels: List[Any]) -> Tuple[Dict, Dict, float]:
         t = time()
         preds = pipeline.predict(images)
         mean_time = (time() - t) / len(images)
-        return classification_report(labels, preds, output_dict=True), mean_time
+        return classification_report(labels, preds, output_dict=True), confusion_matrix(labels, preds), mean_time
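
Note on the new *_confusion_matrix fields: sklearn's confusion_matrix(labels, preds) returns a plain NumPy array (rows are true classes, columns are predicted classes, in sorted label order) rather than a Dict, so DataFrame(confusion_matrix) renders the table with numeric 0..n-1 headers. Below is a minimal sketch of how the class names could be attached to make the reported table self-describing; the labelled_confusion_matrix helper is hypothetical and not part of this diff.

    from typing import Any, List

    from pandas import DataFrame
    from sklearn.metrics import confusion_matrix
    from sklearn.utils.multiclass import unique_labels


    def labelled_confusion_matrix(labels: List[Any], preds: List[Any]) -> DataFrame:
        # Hypothetical helper: wrap the raw confusion matrix in a DataFrame whose
        # index (true classes) and columns (predicted classes) are the class names,
        # so a markdown table would show readable headers instead of 0..n-1.
        class_names = unique_labels(labels, preds)  # sorted union of both label sets
        matrix = confusion_matrix(labels, preds, labels=class_names)
        return DataFrame(matrix, index=class_names, columns=class_names)


    # Example:
    #   labelled_confusion_matrix(["red", "blue", "red"], ["red", "red", "red"])
    # gives
    #         blue  red
    #   blue     0    1
    #   red      0    2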