# Copyright (C) 2021, Pyronear contributors.
# This program is licensed under the GNU Affero General Public License version 3.
# See LICENSE or go to <https://www.gnu.org/licenses/agpl-3.0.txt> for full license details.
from typing import List, Union, Optional, Tuple, Callable
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from .utils import check_xy, check_x
import pandas as pd
import numpy as np

class TargetDiscretizer(BaseEstimator):
    """Discretize a numerical target variable.

    The `TargetDiscretizer` transformer maps target variable values to discrete
    values using a user-defined function.

    Parameters:
        discretizer: user-defined function.
    """

    def __init__(self, discretizer: Callable) -> None:
        if callable(discretizer):
            self.discretizer = discretizer
        else:
            raise TypeError(f"{self.__class__.__name__} constructor expects a callable")

    def fit_resample(
        self, X: pd.DataFrame, y: pd.Series
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """Discretize the target variable.

        The `fit_resample` method discretizes the target variable. It does not
        resample the dataset; the naming convention ensures the compatibility of
        the transformer with the imbalanced-learn `Pipeline` object.

        Args:
            X: Training dataset features.
            y: Training dataset target.

        Returns:
            Training dataset features and target tuple.
        """
        X, y = check_xy(X, y)

        y = y.apply(self.discretizer)

        return X, y
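
# Usage sketch (illustrative only, not part of the original module); `weather_df`
# and `fwi_series` are hypothetical pandas objects and the binning rule is arbitrary:
#
#     discretizer = TargetDiscretizer(discretizer=lambda fwi: int(fwi > 15))
#     X_res, y_res = discretizer.fit_resample(weather_df, fwi_series)
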

class CategorySelector(BaseEstimator):
    """Select features and target rows.

    The `CategorySelector` transformer selects features and target rows
    belonging to given variable categories.

    Parameters:
        variable: variable to be used for selection.
        category: modalities to be selected.
    """

    def __init__(self, variable: str, category: Union[str, list]) -> None:
        self.variable = variable
        # Normalize to a list to prevent key errors when filtering with `isin`
        if isinstance(category, str):
            self.category = [category]
        elif isinstance(category, list):
            self.category = category
        else:
            raise TypeError(
                f"{self.__class__.__name__} constructor category argument expects a string or a list"
            )

    def fit_resample(
        self, X: pd.DataFrame, y: Optional[pd.Series] = None
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """Select features and target rows.

        The `fit_resample` method selects the features and target rows. It does
        not resample the dataset; the naming convention ensures the compatibility
        of the transformer with the imbalanced-learn `Pipeline` object.

        Args:
            X: Training dataset features.
            y: Training dataset target.

        Returns:
            Training dataset features and target tuple.
        """
        if isinstance(X, pd.DataFrame) and isinstance(y, pd.Series):
            mask = X[self.variable].isin(self.category)
            XR = X[mask].copy()
            yr = y[mask].copy()

        else:
            raise TypeError(
                f"{self.__class__.__name__} fit_resample method expects a pd.DataFrame "
                "and a pd.Series as inputs."
            )

        return XR, yr
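
# Usage sketch (illustrative only, not part of the original module); the column
# name "departement" and its categories are assumed for the example:
#
#     selector = CategorySelector(variable="departement", category=["Ardèche", "Gard"])
#     X_sel, y_sel = selector.fit_resample(weather_df, fwi_series)
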

class Imputer(SimpleImputer):
    """Impute missing values.

    The `Imputer` transformer wraps the scikit-learn SimpleImputer transformer.

    Parameters:
        columns: columns on which to fit the imputer.
        missing_values: the placeholder for the missing values.
        strategy: the imputation strategy (mean, median, most_frequent, constant).
        fill_value: used to replace all occurrences of missing_values when strategy is "constant" (defaults to 0 for numerical data).
        verbose: controls the verbosity of the imputer.
        copy: if True, a copy of X will be created.
        add_indicator: if True, a MissingIndicator transform will stack onto the output of the imputer's transform.
    """

    def __init__(
        self,
        columns: list,
        missing_values: Union[int, float, str] = np.nan,
        strategy: str = "mean",
        fill_value: Optional[float] = None,
        verbose: int = 0,
        copy: bool = True,
        add_indicator: bool = False,
    ) -> None:
        super().__init__(
            missing_values=missing_values,
            strategy=strategy,
            fill_value=fill_value,
            verbose=verbose,
            copy=copy,
            add_indicator=add_indicator,
        )
        self.columns = columns

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> "Imputer":
        """Fit the imputer on X.

        Args:
            X: Training dataset features.
            y: Training dataset target.

        Returns:
            Transformer.
        """
        X, y = check_xy(X[self.columns], y)

        super().fit(X, y)

        return self
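
# Usage sketch (illustrative only, not part of the original module); column names
# are hypothetical. `fit` restricts the imputation to `columns`, so `transform`
# is called on the same subset:
#
#     imputer = Imputer(columns=["temperature", "humidity"], strategy="median")
#     imputer.fit(weather_df, fwi_series)
#     imputed_values = imputer.transform(weather_df[imputer.columns])
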

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select features correlated to the target.

    Select features whose correlation to the target is above the threshold.

    Parameters:
        exclude: columns to exclude from the correlation calculation.
        method: correlation matrix calculation method.
        threshold: correlation threshold used to select features.
    """

    def __init__(
        self, exclude: List[str], method: str = "pearson", threshold: float = 0.15
    ) -> None:
        self.exclude = exclude
        self.method = method
        self.threshold = threshold

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> "FeatureSelector":
        """Fit the FeatureSelector on X.

        Compute the absolute correlation of each feature to the target.

        Args:
            X: Training dataset features.
            y: Training dataset target.

        Returns:
            Transformer.
        """
        X, y = check_xy(X, y)

        self.target_correlation = (
            pd.concat([X, y], axis=1)
            .corr(method=self.method)
            .loc[y.name]
            .apply(abs)
            .sort_values(ascending=False)
        )
        self.target_correlation = self.target_correlation[
            self.target_correlation.index != y.name
        ]

        return self
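
# Usage sketch (illustrative only, not part of the original module); after `fit`,
# the absolute correlations to the target are stored in `target_correlation`:
#
#     fs = FeatureSelector(exclude=["departement"], method="pearson", threshold=0.15)
#     fs.fit(weather_df, fwi_series)
#     print(fs.target_correlation.head())
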

class FeatureSubsetter(BaseEstimator, TransformerMixin):
    """Subset the dataframe's columns.

    Subset any given columns of the dataframe.

    Parameters:
        columns: columns to keep.
    """

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> "FeatureSubsetter":
        """Comply with pipeline requirements.

        The method does not fit the dataset; the naming convention ensures the
        compatibility of the transformer with the scikit-learn `Pipeline` object.

        Args:
            X: Training dataset features.
            y: Training dataset target.

        Returns:
            Transformer.
        """
        X, y = check_xy(X, y)

        return self
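
# Usage sketch (illustrative only, not part of the original module); column names
# are hypothetical. `fit` only validates the inputs; the corresponding `transform`
# (not shown in this excerpt) would return the `columns` subset of X:
#
#     subsetter = FeatureSubsetter(columns=["temperature", "humidity"])
#     subsetter.fit(weather_df, fwi_series)
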