Module skplumber.primitives.custom_primitives.preprocessing

Expand source code
import typing as t

import pandas as pd
import numpy as np

from skplumber.primitives.primitive import Primitive
from skplumber.consts import PrimitiveType


class OneHotEncoder(Primitive):
    """
    One-hot encodes any `object` or `category` columns. If the number of
    unique values is large, it just encodes the most common ones. `nan` values
    are not encoded. This primitive is heavily inspired by USC ISI's DSBOX
    encoder primitive used in the D3M ecosystem. See:
    https://github.com/usc-isi-i2/dsbox-primitives
    """

    primitive_type = PrimitiveType.PREPROCESSOR
    param_metas = {}

    def __init__(self, top_n: int = 10) -> None:
        # the max number of most common values to
        # one-hot encode for each column
        self.top_n = top_n
        self.onehot_col_names_to_vals: t.Dict[str, pd.Series] = {}

    def fit(self, X, y) -> None:
        # We could be fitting on a new dataset with different columns,
        # so forget everything from the last dataset.
        self.onehot_col_names_to_vals.clear()

        # Get the categorical columns
        categoricals = X.select_dtypes(include=["object", "category"])
        for col_name in categoricals.columns:
            # Get the `self.top_n` values that occur most frequently
            # in the column.
            top_n_vals = pd.Series(
                categoricals[col_name].value_counts().nlargest(self.top_n).index
            )
            self.onehot_col_names_to_vals[col_name] = top_n_vals

    def produce(self, X):
        if len(self.onehot_col_names_to_vals) == 0:
            # This dataset does not need one hot encoding
            return X

        # Use pd.get_dummies() to do the encoding, then keep only the columns
        # that are found in the map created in `self.fit`.
        categoricals = X.select_dtypes(include=["object", "category"])
        one_hotted = pd.get_dummies(categoricals)
        result = X.copy()

        for col_name, vals_to_onehot in self.onehot_col_names_to_vals.items():
            # get rid of the un-encoded column, then add the
            # one-hot encoded ones, only adding the ones that
            # were created in `self.fit`.
            result = result.drop(col_name, axis=1)
            for val in vals_to_onehot:
                onehot_col_name = f"{col_name}_{val}"
                if onehot_col_name in one_hotted.columns:
                    result[onehot_col_name] = one_hotted[onehot_col_name]
                else:
                    result[onehot_col_name] = 0

        return result


class RandomImputer(Primitive):
    """
    Imputes missing values for each column by randomly sampling
    from the known values of that column. Has the benefit of
    preserving the column's distribution.
    """

    primitive_type = PrimitiveType.PREPROCESSOR
    param_metas = {}

    def __init__(self) -> None:
        self.col_names_to_known_vals: t.Dict[str, pd.Series] = {}
        self.cols_to_drop: t.Set[str] = set()

    def fit(self, X, y) -> None:
        # We could be fitting on a new dataset with different columns,
        # so forget everything from the last dataset.
        self.col_names_to_known_vals.clear()
        self.cols_to_drop.clear()

        for col in X:
            if X[col].isna().all():
                # This column has no values, so we won't have any values
                # to sample from when imputing values for it. So we drop
                # it at produce time.
                self.cols_to_drop.add(col)
                continue

            # The index of a series returned by `pd.Series.value_counts`
            # holds the values, and the actual entries of the series hold
            # the proportions those values have in `X`.
            self.col_names_to_known_vals[col] = X[col].value_counts(normalize=True)

    def produce(self, X):
        result = X.copy()
        result.drop(self.cols_to_drop, axis="columns", inplace=True)

        # Impute missing values using the known values found
        # in `self.fit`
        for col, known_vals in self.col_names_to_known_vals.items():

            # Fill all missing values with values sampled from the
            # distribution observed for this column in the `self.fit`
            # method.
            fill_vals = pd.Series(
                np.random.choice(known_vals.index, p=known_vals, size=len(result.index))
            )

            # The indices of fill_vals and result need to match so
            # every NaN in result can have a companion value in
            # `fill_vals` to be filled with.
            fill_vals.index = result.index
            result[col].fillna(
                fill_vals, inplace=True,
            )

        return result

Classes

class OneHotEncoder (top_n: int = 10)

One-hot encodes any object or category columns. If the number of unique values is large, it just encodes the most common ones. nan values are not encoded. This primitive is heavily inspired by USC ISI's DSBOX encoder primitive used in the D3M ecosystem. See: https://github.com/usc-isi-i2/dsbox-primitives
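
For illustration, a minimal usage sketch (the data and the top_n value below are hypothetical; fit does not use y, so None is passed here):

import pandas as pd
from skplumber.primitives.custom_primitives.preprocessing import OneHotEncoder

X = pd.DataFrame({
    "color": ["red", "blue", "red", "green"],  # categorical column
    "size": [1.0, 2.0, 3.0, 4.0],              # numeric column, left untouched
})
encoder = OneHotEncoder(top_n=2)
encoder.fit(X, None)            # remembers the 2 most common "color" values
X_encoded = encoder.produce(X)
# "color" is replaced by indicator columns such as "color_red"; a row whose
# value was not kept at fit time gets 0 in every created indicator column.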

Expand source code
class OneHotEncoder(Primitive):
    """
    One-hot encodes any `object` or `category` columns. If the number of
    unique values is large, it just encodes the most common ones. `nan` values
    are not encoded. This primitive is heavily inspired by USC ISI's DSBOX
    encoder primitive used in the D3M ecosystem. See:
    https://github.com/usc-isi-i2/dsbox-primitives
    """

    primitive_type = PrimitiveType.PREPROCESSOR
    param_metas = {}

    def __init__(self, top_n: int = 10) -> None:
        # the max number of most common values to
        # one-hot encode for each column
        self.top_n = top_n
        self.onehot_col_names_to_vals: t.Dict[str, pd.Series] = {}

    def fit(self, X, y) -> None:
        # We could be fitting on a new dataset with different columns,
        # so forget everything from the last dataset.
        self.onehot_col_names_to_vals.clear()

        # Get the categorical columns
        categoricals = X.select_dtypes(include=["object", "category"])
        for col_name in categoricals.columns:
            # Get the `self.top_n` values that occur most frequently
            # in the column.
            top_n_vals = pd.Series(
                categoricals[col_name].value_counts().nlargest(self.top_n).index
            )
            self.onehot_col_names_to_vals[col_name] = top_n_vals

    def produce(self, X):
        if len(self.onehot_col_names_to_vals) == 0:
            # This dataset does not need one hot encoding
            return X

        # Use pd.get_dummies() to do the encoding, then keep only the columns
        # that are found in the map created in `self.fit`.
        categoricals = X.select_dtypes(include=["object", "category"])
        one_hotted = pd.get_dummies(categoricals)
        result = X.copy()

        for col_name, vals_to_onehot in self.onehot_col_names_to_vals.items():
            # get rid of the un-encoded column, then add the
            # one-hot encoded ones, only adding the ones that
            # were created in `self.fit`.
            result = result.drop(col_name, axis=1)
            for val in vals_to_onehot:
                onehot_col_name = f"{col_name}_{val}"
                if onehot_col_name in one_hotted.columns:
                    result[onehot_col_name] = one_hotted[onehot_col_name]
                else:
                    result[onehot_col_name] = 0

        return result

Ancestors

skplumber.primitives.primitive.Primitive

Class variables

var param_metas
var primitive_type

Methods

def fit(self, X, y) ‑> NoneType
Expand source code
def fit(self, X, y) -> None:
    # We could be fitting on a new dataset with different columns,
    # so forget everything from the last dataset.
    self.onehot_col_names_to_vals.clear()

    # Get the categorical columns
    categoricals = X.select_dtypes(include=["object", "category"])
    for col_name in categoricals.columns:
        # Get the `self.top_n` values that occur most frequently
        # in the column.
        top_n_vals = pd.Series(
            categoricals[col_name].value_counts().nlargest(self.top_n).index
        )
        self.onehot_col_names_to_vals[col_name] = top_n_vals
def produce(self, X)
Expand source code
def produce(self, X):
    if len(self.onehot_col_names_to_vals) == 0:
        # This dataset does not need one hot encoding
        return X

    # Use pd.get_dummies() to do the encoding, then keep only the columns
    # that are found in the map created in `self.fit`.
    categoricals = X.select_dtypes(include=["object", "category"])
    one_hotted = pd.get_dummies(categoricals)
    result = X.copy()

    for col_name, vals_to_onehot in self.onehot_col_names_to_vals.items():
        # get rid of the un-encoded column, then add the
        # one-hot encoded ones, only adding the ones that
        # were created in `self.fit`.
        result = result.drop(col_name, axis=1)
        for val in vals_to_onehot:
            onehot_col_name = f"{col_name}_{val}"
            if onehot_col_name in one_hotted.columns:
                result[onehot_col_name] = one_hotted[onehot_col_name]
            else:
                result[onehot_col_name] = 0

    return result
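
At produce time, every value stored during fit gets an indicator column even if it is absent from the new data, while values never stored get none; a small illustrative sketch (data and names are hypothetical):

import pandas as pd
from skplumber.primitives.custom_primitives.preprocessing import OneHotEncoder

enc = OneHotEncoder(top_n=2)
enc.fit(pd.DataFrame({"color": ["red", "red", "blue"]}), None)

out = enc.produce(pd.DataFrame({"color": ["green", "green"]}))
# "green" was never recorded in fit, so it gets no column of its own; the
# stored values still get columns, filled with 0 for every row here:
# list(out.columns) -> ["color_red", "color_blue"]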

Inherited members

class RandomImputer

Imputes missing values for each column by randomly sampling from the known values of that column. Has the benefit of preserving the column's distribution.
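
For illustration, a minimal usage sketch (the data below is hypothetical; fit does not use y, so None is passed here):

import numpy as np
import pandas as pd
from skplumber.primitives.custom_primitives.preprocessing import RandomImputer

X = pd.DataFrame({
    "age": [25.0, np.nan, 40.0, np.nan],        # has missing values to impute
    "notes": [np.nan, np.nan, np.nan, np.nan],  # all-NaN column, dropped at produce time
})
imputer = RandomImputer()
imputer.fit(X, None)
X_imputed = imputer.produce(X)
# "notes" is gone; each NaN in "age" is replaced by a value drawn from the
# distribution learned in fit, here {25.0: 0.5, 40.0: 0.5}.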

Expand source code
class RandomImputer(Primitive):
    """
    Imputes missing values for each column by randomly sampling
    from the known values of that column. Has the benefit of
    preserving the column's distribution.
    """

    primitive_type = PrimitiveType.PREPROCESSOR
    param_metas = {}

    def __init__(self) -> None:
        self.col_names_to_known_vals: t.Dict[str, pd.Series] = {}
        self.cols_to_drop: t.Set[str] = set()

    def fit(self, X, y) -> None:
        # We could be fitting on a new dataset with different columns,
        # so forget everything from the last dataset.
        self.col_names_to_known_vals.clear()
        self.cols_to_drop.clear()

        for col in X:
            if X[col].isna().all():
                # This column has no values, so we won't have any values
                # to sample from when imputing values for it. So we drop
                # it at produce time.
                self.cols_to_drop.add(col)
                continue

            # The index of a series returned by `pd.Series.value_counts`
            # holds the values, and the actual entries of the series hold
            # the proportions those values have in `X`.
            self.col_names_to_known_vals[col] = X[col].value_counts(normalize=True)

    def produce(self, X):
        result = X.copy()
        result.drop(self.cols_to_drop, axis="columns", inplace=True)

        # Impute missing values using the known values found
        # in `self.fit`
        for col, known_vals in self.col_names_to_known_vals.items():

            # Fill all missing values with values sampled from the
            # distribution observed for this column in the `self.fit`
            # method.
            fill_vals = pd.Series(
                np.random.choice(known_vals.index, p=known_vals, size=len(result.index))
            )

            # The indices of fill_vals and result need to match so
            # every NaN in result can have a companion value in
            # `fill_vals` to be filled with.
            fill_vals.index = result.index
            result[col].fillna(
                fill_vals, inplace=True,
            )

        return result

Ancestors

skplumber.primitives.primitive.Primitive

Class variables

var param_metas
var primitive_type

Methods

def fit(self, X, y) ‑> NoneType
Expand source code
def fit(self, X, y) -> None:
    # We could be fitting on a new dataset with different columns,
    # so forget everything from the last dataset.
    self.col_names_to_known_vals.clear()
    self.cols_to_drop.clear()

    for col in X:
        if X[col].isna().all():
            # This column has no values, so we won't have any values
            # to sample from when imputing values for it. So we drop
            # it at produce time.
            self.cols_to_drop.add(col)
            continue

        # The index of a series returned by `pd.Series.value_counts`
        # holds the values, and the actual entries of the series hold
        # the proportions those values have in `X`.
        self.col_names_to_known_vals[col] = X[col].value_counts(normalize=True)
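
For intuition, the per-column "known values" stored here are just a normalized value_counts Series; for example (illustrative data):

import pandas as pd

pd.Series(["a", "a", "b", None]).value_counts(normalize=True)
# The index holds the values and the entries hold their proportions:
# "a" -> 2/3, "b" -> 1/3; NaN is excluded by default.
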
def produce(self, X)
Expand source code
def produce(self, X):
    result = X.copy()
    result.drop(self.cols_to_drop, axis="columns", inplace=True)

    # Impute missing values using the known values found
    # in `self.fit`
    for col, known_vals in self.col_names_to_known_vals.items():

        # Fill all missing values with values sampled from the
        # distribution observed for this column in the `self.fit`
        # method.
        fill_vals = pd.Series(
            np.random.choice(known_vals.index, p=known_vals, size=len(result.index))
        )

        # The indices of fill_vals and result need to match so
        # every NaN in result can have a companion value in
        # `fill_vals` to be filled with.
        fill_vals.index = result.index
        result[col].fillna(
            fill_vals, inplace=True,
        )

    return result
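
Because produce draws fill values with numpy's global random state, seeding it makes the imputation repeatable; a hedged sketch, assuming imputer and X set up as in the usage example above:

np.random.seed(0)
filled_a = imputer.produce(X)
np.random.seed(0)
filled_b = imputer.produce(X)
assert filled_a.equals(filled_b)  # same seed, same draws, same imputations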

Inherited members