Module skplumber.primitives.sk_primitives.transformers

Expand source code
import typing as t

from sklearn.preprocessing import (
    MinMaxScaler,
    PolynomialFeatures,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)
from sklearn.feature_selection import (
    SelectPercentile,
    SelectKBest,
    SelectFpr,
    SelectFwe,
    VarianceThreshold,
)
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.manifold import (
    Isomap,
    LocallyLinearEmbedding,
    MDS,
    SpectralEmbedding,
    TSNE,
)
from sklearn.base import BaseEstimator

from skplumber.primitives.primitive import make_sklearn_primitive
from skplumber.consts import PrimitiveType
from skplumber.primitives.parammeta import (
    ParamMeta,
    BoolParamMeta,
    IntParamMeta,
    FloatParamMeta,
    CategoricalParamMeta,
)

# Table of scikit-learn transformers exposed as primitives.
#
# Each entry is a pair of:
#   1. the estimator *class* (not an instance), and
#   2. a mapping from hyperparameter name to the `ParamMeta` describing it.
#
# Several numeric metas receive a callable taking `X` instead of a constant
# bound, so the bound can depend on the dataset (e.g. its number of rows).
# NOTE(review): presumably the `ParamMeta` classes evaluate these callables
# against the training data -- confirm in `skplumber.primitives.parammeta`.
_transformers: t.List[
    t.Tuple[t.Type[BaseEstimator], t.Dict[str, ParamMeta]]
] = [
    (MinMaxScaler, {}),
    (
        PolynomialFeatures,
        {"interaction_only": BoolParamMeta(), "include_bias": BoolParamMeta()},
    ),
    (
        QuantileTransformer,
        {
            # Upper bound is the number of rows in `X`.
            "n_quantiles": IntParamMeta(2, lambda X: X.shape[0]),
            "output_distribution": CategoricalParamMeta(["uniform", "normal"]),
        },
    ),
    (RobustScaler, {}),
    (StandardScaler, {}),
    (SelectPercentile, {"percentile": IntParamMeta(10, 100)}),
    # Default hyperparameters don't always work
    # (SelectKBest, {}),
    (SelectFpr, {"alpha": FloatParamMeta(0.05, 1.0)}),
    (SelectFwe, {"alpha": FloatParamMeta(0.05, 1.0)}),
    (
        VarianceThreshold,
        # The range is from 0 (no variance) to about everything but the
        # column in `X` with highest variance. The small epsilon keeps the
        # upper bound just below that highest variance.
        {"threshold": FloatParamMeta(0.0, lambda X: X.var().max() - 1e-10)},
    ),
    (
        PCA,
        {
            # Upper bound is the smaller of the two dimensions of `X`.
            "n_components": IntParamMeta(1, lambda X: min(X.shape)),
            "whiten": BoolParamMeta(),
        },
    ),
    (
        RandomTreesEmbedding,
        {
            "n_estimators": IntParamMeta(2, 200),
            # Upper bound is half the number of rows in `X` (floored).
            "max_depth": IntParamMeta(1, lambda X: X.shape[0] // 2),
            "min_samples_split": IntParamMeta(2, lambda X: X.shape[0]),
            # TODO: more
        },
    ),
    (
        Isomap,
        {
            # Upper bound is half the number of rows in `X` (floored).
            "n_neighbors": IntParamMeta(1, lambda X: X.shape[0] // 2),
            "n_components": IntParamMeta(1, lambda X: min(X.shape)),
            "metric": CategoricalParamMeta(
                [
                    "cosine",
                    "euclidean",
                    "manhattan",
                    "braycurtis",
                    "canberra",
                    "chebyshev",
                    "correlation",
                ]
            ),
        },
    ),
    (
        LocallyLinearEmbedding,
        {
            # Upper bound is half the number of rows in `X` (floored).
            "n_neighbors": IntParamMeta(1, lambda X: X.shape[0] // 2),
            "n_components": IntParamMeta(1, lambda X: min(X.shape)),
            "reg": FloatParamMeta(0.0, 1e4),
        },
    ),
    # These have no `transform` method
    # (MDS, {}),
    # (SpectralEmbedding, {}),
    # (TSNE, {}),
]

# Public registry of transformer primitives, keyed by each primitive's
# `__name__`. Every (estimator class, param metadata) pair listed in
# `_transformers` is wrapped by `make_sklearn_primitive` with the primitive
# type `PrimitiveType.TRANSFORMER`.
transformers: t.Dict[str, t.Any] = {}
for est, param_metas in _transformers:
    primitive = make_sklearn_primitive(est, PrimitiveType.TRANSFORMER, param_metas)
    # Two primitives with the same `__name__` would silently overwrite each
    # other here; the last one registered wins.
    transformers[primitive.__name__] = primitive