Source code for pyro_risks.pipeline.evaluate

# Copyright (C) 2021, Pyronear contributors.

# This program is licensed under the GNU Affero General Public License version 3.
# See LICENSE or go to <https://www.gnu.org/licenses/agpl-3.0.txt> for full license details.

from typing import Union, Optional
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from plot_metric.functions import BinaryClassification
from pyro_risks.models import discretizer
from pyro_risks.pipeline.load import load_dataset

import matplotlib.pyplot as plt
import imblearn.pipeline as pp
import pyro_risks.config as cfg

import pandas as pd
import numpy as np

import os
import json
import joblib

__all__ = [
    "save_classification_reports",
    "save_classification_plots",
    "evaluate_pipeline",
]


def save_classification_reports(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    prefix: Optional[str] = None,
    destination: Optional[str] = None,
) -> None:
    """
    Build and save binary classification metrics reports.

    Args:
        y_true: Ground truth (correct) labels.
        y_pred: Predicted labels, as returned by a calibrated classifier.
        prefix: Classification report prefix, i.e. pipeline name. Defaults to None.
        destination: Folder where the reports should be saved. Defaults to ``METADATA_REGISTRY``.
    """
    destination = cfg.METADATA_REGISTRY if destination is None else destination
    fname = (
        "classification_report"
        if prefix is None
        else prefix + "_classification_report"
    )
    json_report_path = os.path.join(destination, fname + ".json")
    csv_report_path = os.path.join(destination, fname + ".csv")

    # Keep only the per-class entries in the persisted reports
    report = classification_report(y_true, y_pred, output_dict=True)
    report.pop("accuracy")
    report.pop("macro avg")
    report.pop("weighted avg")

    # JSON report for tracking metrics
    with open(json_report_path, "w") as fp:
        json.dump(obj=report, fp=fp)

    # CSV report for plotting classification report
    pd.DataFrame(report).transpose().round(3).to_csv(csv_report_path)

    print(classification_report(y_true, y_pred))
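
# A minimal usage sketch (illustrative, kept as a comment so the module stays
# importable): the arrays and the "demo" prefix are hypothetical, and the
# reports land in ``cfg.METADATA_REGISTRY`` unless a destination is given.
#
#     y_true = np.array([0, 1, 1, 0, 1])
#     y_pred = np.array([0, 1, 0, 0, 1])
#     save_classification_reports(y_true, y_pred, prefix="demo")
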
def save_classification_plots(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    threshold: np.float64,
    prefix: Optional[str] = None,
    destination: Optional[str] = None,
) -> None:
    """
    Build and save binary classification performance evaluation plots.

    Args:
        y_true: Ground truth (correct) labels.
        y_proba: Predicted probabilities of the positive class returned by a classifier.
        threshold: Classification pipeline optimal threshold.
        prefix: Classification plots prefix, i.e. pipeline name. Defaults to None.
        destination: Folder where the plots should be saved. Defaults to ``METADATA_REGISTRY``.
    """
    destination = cfg.METADATA_REGISTRY if destination is None else destination
    fname = (
        "classification_plots.png"
        if prefix is None
        else prefix + "_classification_plots.png"
    )
    path = os.path.join(destination, fname)

    bc = BinaryClassification(y_true, y_proba, labels=["No fire", "Fire"])

    # Lay out ROC, precision-recall, class distribution and (normalized)
    # confusion matrix plots on a 2x6 grid, then save the figure to disk
    plt.figure(figsize=(15, 10))
    plt.subplot2grid(shape=(2, 6), loc=(0, 0), colspan=2)
    bc.plot_roc_curve(threshold=threshold)
    plt.subplot2grid((2, 6), (0, 2), colspan=2)
    bc.plot_precision_recall_curve(threshold=threshold)
    plt.subplot2grid((2, 6), (0, 4), colspan=2)
    bc.plot_class_distribution(threshold=threshold)
    plt.subplot2grid((2, 6), (1, 1), colspan=2)
    bc.plot_confusion_matrix(threshold=threshold)
    plt.subplot2grid((2, 6), (1, 3), colspan=2)
    bc.plot_confusion_matrix(threshold=threshold, normalize=True)
    plt.savefig(path)
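
# A minimal usage sketch (illustrative): in practice ``y_proba`` would come
# from a fitted classifier's ``predict_proba(X_test)[:, 1]``; the arrays, the
# 0.5 threshold and the "demo" prefix below are hypothetical placeholders.
#
#     y_true = np.array([0, 1, 1, 0, 1])
#     y_proba = np.array([0.1, 0.8, 0.4, 0.2, 0.9])
#     save_classification_plots(y_true, y_proba, threshold=0.5, prefix="demo")
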
def evaluate_pipeline(
    X: pd.DataFrame,
    y: pd.Series,
    pipeline: Union[pp.Pipeline, str],
    threshold: str,
    prefix: Optional[str] = None,
    destination: Optional[str] = None,
) -> None:
    """
    Build and save binary classification evaluation reports.

    Args:
        X: Training dataset features pd.DataFrame.
        y: Training dataset target pd.Series.
        pipeline: Fitted imbalanced-learn pipeline, or path to a serialized pipeline.
        threshold: Path to the JSON file holding the classification pipeline's optimal threshold.
        prefix: Classification reports prefix, i.e. pipeline name. Defaults to None.
        destination: Folder where the reports should be saved. Defaults to ``METADATA_REGISTRY``.
    """
    # Recreate the held-out test split used at training time
    _, X_test, _, y_test = train_test_split(
        X, y, test_size=cfg.TEST_SIZE, random_state=cfg.RANDOM_STATE
    )

    if not isinstance(pipeline, pp.Pipeline):
        pipeline = joblib.load(pipeline)

    y_proba = pipeline.predict_proba(X_test)

    with open(threshold, "r") as file:
        optimal_threshold = json.load(file)

    # Binarize probabilities at the optimal threshold and discretize the target
    def predict(x):
        return 1 if x > optimal_threshold["threshold"] else 0

    vpredict = np.vectorize(predict)
    vdiscretizer = np.vectorize(discretizer)

    y_pred = vpredict(y_proba[:, 1])
    y_test = vdiscretizer(y_test)

    save_classification_reports(
        y_true=y_test, y_pred=y_pred, prefix=prefix, destination=destination
    )

    save_classification_plots(
        y_true=y_test,
        y_proba=y_proba[:, 1],
        threshold=optimal_threshold["threshold"],
        prefix=prefix,
        destination=destination,
    )
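
# A minimal usage sketch (illustrative): ``load_dataset`` is assumed here to
# return the (X, y) training pair, and the .joblib/.json artifact paths below
# are hypothetical examples of a serialized pipeline and its threshold file.
#
#     X, y = load_dataset()
#     evaluate_pipeline(
#         X,
#         y,
#         pipeline="model_registry/demo_pipeline.joblib",
#         threshold="metadata_registry/demo_threshold.json",
#         prefix="demo",
#     )
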