Module skplumber.tuners.ga
Source code
import typing as t
from collections import defaultdict
import copy

import pandas as pd
from flexga import flexga
from flexga.utils import inverted
from flexga.argmeta import (
    ArgMeta,
    FloatArgMeta,
    IntArgMeta,
    BoolArgMeta,
    CategoricalArgMeta,
)

from skplumber.pipeline import Pipeline
from skplumber.metrics import Metric
from skplumber.primitives.parammeta import (
    IntParamMeta,
    FloatParamMeta,
    BoolParamMeta,
    CategoricalParamMeta,
)
from skplumber.utils import logger, PipelineRunError
from skplumber.consts import OptimizationDirection
from skplumber.tuners.utils import TuneResult

def _range_rule(lbound, ubound) -> float:
    """
    Uses a modified version (dividing by 10 instead of 4) of the
    range rule heuristic to give a very rough estimate of a good
    standard deviation for the normal distribution genetic mutations
    are sampled from, for a hyperparameter with the range
    (lbound, ubound).
    """
    return (ubound - lbound) / 10
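
# For example (bounds chosen arbitrarily for illustration): a
# hyperparameter bounded to (0, 100) gets a mutation standard deviation
# of (100 - 0) / 10 == 10.0, and one bounded to (0.0, 1.0) gets 0.1.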

def _get_flexga_metas(pipeline: Pipeline, X: pd.DataFrame) -> t.Dict[str, ArgMeta]:
    """
    Converts meta information about the hyperparameters of a
    pipeline's primitive steps to the format the `flexga` package
    uses to know the bounds and characteristics of those
    hyperparameters (the things `flexga` is optimizing).
    """
    param_metas = pipeline.param_metas_with_data(X)
    kwargsmeta = {}
    for i, step_pmetas in param_metas.items():
        for key, pmeta in step_pmetas.items():
            # Flatten the (step index, param name) pair into a single
            # string key, since flexga optimizes a flat kwargs dict.
            flexga_key = f"{i},{key}"
            if isinstance(pmeta, IntParamMeta):
                flexga_arg_meta = IntArgMeta(
                    (pmeta.lbound, pmeta.ubound),
                    _range_rule(pmeta.lbound, pmeta.ubound),
                )
            elif isinstance(pmeta, FloatParamMeta):
                flexga_arg_meta = FloatArgMeta(
                    (pmeta.lbound, pmeta.ubound),
                    _range_rule(pmeta.lbound, pmeta.ubound),
                )
            elif isinstance(pmeta, BoolParamMeta):
                flexga_arg_meta = BoolArgMeta()
            elif isinstance(pmeta, CategoricalParamMeta):
                flexga_arg_meta = CategoricalArgMeta(pmeta.options)
            else:
                raise ValueError(
                    f"unsupported ParamMeta type {type(pmeta)} for {key} param"
                )
            kwargsmeta[flexga_key] = flexga_arg_meta
    return kwargsmeta
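
# As an illustration (the step index and hyperparameter names below are
# made up): a pipeline whose step 0 exposes an integer `max_depth` in
# [1, 20] and a float `alpha` in [0.0, 1.0] would produce
#
#     {
#         "0,max_depth": IntArgMeta((1, 20), 1.9),
#         "0,alpha": FloatArgMeta((0.0, 1.0), 0.1),
#     }
#
# where the second argument to each ArgMeta is the mutation standard
# deviation computed by `_range_rule`.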

def _get_params_from_flexga(flexga_params: dict) -> t.Dict[int, t.Dict[str, t.Any]]:
    """
    Converts flexga's flattened param dictionary back to the nested
    dictionary `Pipeline.set_params` expects.
    """
    params: t.Dict[int, t.Dict[str, t.Any]] = defaultdict(dict)
    for flexga_key, value in flexga_params.items():
        i, key = flexga_key.split(",")
        params[int(i)][key] = value
    return params
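
# For example (hyperparameter names are illustrative only):
#
#     params = _get_params_from_flexga({"0,max_depth": 12, "1,alpha": 0.5})
#     # params == {0: {"max_depth": 12}, 1: {"alpha": 0.5}} (as a defaultdict)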

def ga_tune(
    pipeline: Pipeline,
    X: pd.DataFrame,
    y: pd.Series,
    evaluator: t.Callable,
    metric: Metric,
    exit_on_pipeline_error: bool = True,
    **flexgakwargs,
) -> TuneResult:
    """
    Performs genetic algorithm hyperparameter tuning on `pipeline`,
    returning the best score it could find and the number of evaluations
    it completed. Essentially performs a `.fit` operation on the pipeline,
    where the pipeline is fit with the best performing hyperparameter
    configuration it could find.

    Returns
    -------
    result : TuneResult
        A named tuple containing data about how the tuning process went.
    """
    # See what score the model gets without any tuning.
    starting_params = pipeline.get_params()
    starting_score = evaluator(pipeline, X, y, metric)
    # Keep track of how many evaluations were completed.
    n_evals = 1  # we already completed one

    def objective(*args, **flexga_params) -> float:
        """
        The objective function the genetic algorithm will
        try to maximize.
        """
        params = _get_params_from_flexga(flexga_params)
        nonlocal n_evals

        try:
            pipeline.set_params(params)
            score = evaluator(pipeline, X, y, metric)
        except PipelineRunError as e:
            logger.exception(e)
            if exit_on_pipeline_error:
                raise e
            # Pipelines that error out are bad; give them the worst
            # possible score.
            # TODO: make this `None` or `np.nan` instead.
            score = metric.worst_value

        n_evals += 1
        # The genetic algorithm maximizes, so negate scores for metrics
        # that should be minimized (e.g. RMSE).
        return -score if metric.opt_dir == OptimizationDirection.MINIMIZE else score
    # Use flexga to find the best hyperparameter configuration it can.
    optimal_score, _, optimal_flexga_params = flexga(
        objective, kwargsmeta=_get_flexga_metas(pipeline, X), **flexgakwargs
    )
    # flexga reports the objective value it maximized, which for metrics
    # that are minimized is the negated score; convert it back to the
    # metric's own scale before comparing against the untuned score.
    if metric.opt_dir == OptimizationDirection.MINIMIZE:
        optimal_score = -optimal_score

    if metric.is_better_than(optimal_score, starting_score):
        optimal_params = _get_params_from_flexga(optimal_flexga_params)
        did_improve = True
    else:
        # The tuner couldn't find anything better than the params the
        # pipeline started with under the conditions given.
        optimal_score = starting_score
        optimal_params = starting_params
        did_improve = False

    # Refit the pipeline with the best configuration found.
    pipeline.set_params(optimal_params)
    pipeline.fit(X, y)

    logger.info("tuning complete.")
    logger.info(f"found best pipeline configuration: {pipeline}")
    logger.info(f"found best validation score of {optimal_score}")
    return TuneResult(optimal_score, n_evals, did_improve)
Functions
def ga_tune(pipeline: Pipeline, X: pandas.core.frame.DataFrame, y: pandas.core.series.Series, evaluator: Callable, metric: Metric, exit_on_pipeline_error: bool = True, **flexgakwargs) -> TuneResult

    Performs genetic algorithm hyperparameter tuning on `pipeline`, returning the best score it could find and the number of evaluations it completed. Essentially performs a `.fit` operation on the pipeline, where the pipeline is fit with the best performing hyperparameter configuration it could find.

    Returns
    -------
    result : TuneResult
        A named tuple containing data about how the tuning process went.
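
    As a usage sketch: `my_pipeline`, `my_metric`, and the `metric.score(...)` call below are illustrative assumptions, not confirmed skplumber API. The only contract taken from the source is that `evaluator` is invoked as `evaluator(pipeline, X, y, metric)` and returns a score, and that extra keyword arguments pass straight through to `flexga`.

    from sklearn.model_selection import train_test_split

    from skplumber.tuners.ga import ga_tune


    def holdout_evaluator(pipeline, X, y, metric):
        # Any callable invoked as `evaluator(pipeline, X, y, metric)` works.
        # NOTE: `metric.score(y_true, y_pred)` is an assumed interface; swap
        # in however `Metric` actually computes a score.
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
        pipeline.fit(X_train, y_train)
        return metric.score(y_test, pipeline.predict(X_test))


    # `my_pipeline` (a skplumber Pipeline) and `my_metric` (a Metric) are
    # hypothetical stand-in objects.
    result = ga_tune(
        my_pipeline,
        X,
        y,
        evaluator=holdout_evaluator,
        metric=my_metric,
        exit_on_pipeline_error=False,  # score failing configs as worst-case instead of raising
    )
    # TuneResult is constructed as TuneResult(optimal_score, n_evals, did_improve)
    # in the source above, so it unpacks positionally:
    score, n_evals, improved = result

    The GA's own knobs (population size, iteration limits, and so on) are whatever options `flexga` accepts, forwarded via `**flexgakwargs`.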