Pipelines module
The pipelines module contains the definitions of our scoring pipelines. The risk scoring pipelines are implemented using the imbalanced-learn Pipeline, which allows sequences of resampling, preprocessing and modeling steps to be defined as a single estimator. See scikit-learn Pipelines and composite estimators for more information.
# Copyright (C) 2021, Pyronear contributors.
# This program is licensed under the GNU Affero General Public License version 3.
# See LICENSE or go to <https://www.gnu.org/licenses/agpl-3.0.txt> for full license details.
from imblearn.pipeline import Pipeline
from .transformers import (
TargetDiscretizer,
CategorySelector,
Imputer,
LagTransformer,
FeatureSubsetter,
)
from .utils import discretizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pyro_risks.config as cfg
__all__ = ["rf_pipeline", "xgb_pipeline"]


def _make_base_steps():
    """Build a fresh list of the preprocessing steps common to every pipeline.

    Every call instantiates new step objects. This matters because a
    scikit-learn / imbalanced-learn ``Pipeline`` fits the step objects it is
    given in place (they are only cloned when ``memory`` caching is enabled),
    so two pipelines built from the same instances would overwrite each
    other's fitted state.

    Returns:
        A list of ``(name, step)`` tuples in execution order: ``filter_dep``,
        ``add_lags``, ``imputer``, ``binarize_target``, ``subset_features``.
    """
    return [
        (
            "filter_dep",
            CategorySelector(variable=cfg.ZONE_VAR, category=cfg.SELECTED_DEP),
        ),
        (
            "add_lags",
            LagTransformer(
                date_column=cfg.DATE_VAR,
                zone_column=cfg.ZONE_VAR,
                columns=cfg.LAG_ERA5T_VARS,
            ),
        ),
        ("imputer", Imputer(columns=cfg.MODEL_ERA5T_VARS, strategy="median")),
        ("binarize_target", TargetDiscretizer(discretizer=discretizer)),
        ("subset_features", FeatureSubsetter(columns=cfg.MODEL_ERA5T_VARS)),
    ]


# pipeline base steps definition
# Kept as a module-level name for backward compatibility; the pipelines below
# are built from their own independent copies of these steps.
base_steps = _make_base_steps()

# Add estimator to base step lists
# Each list gets its own step instances so that fitting one pipeline never
# touches the state of the other.
xgb_steps = [*_make_base_steps(), ("xgboost", XGBClassifier(**cfg.XGB_PARAMS))]
rf_steps = [
    *_make_base_steps(),
    ("random_forest", RandomForestClassifier(**cfg.RF_PARAMS)),
]

# Define sklearn / imblearn pipelines
xgb_pipeline = Pipeline(xgb_steps)
rf_pipeline = Pipeline(rf_steps)