# Source code for paddlets.automl.search_space_configer

# !/usr/bin/env python3
# -*- coding:utf-8 -*-

import copy
import json

from ray.tune import uniform, quniform, loguniform, randn, randint, qrandint, lograndint, choice
from ray.tune.sample import Float, Integer, Categorical
from ray.tune.sample import Quantized, Normal

from paddlets.logger import Logger
from paddlets.pipeline import Pipeline
from paddlets.models.forecasting.dl.paddle_base import PaddleBaseModel
from paddlets.models.forecasting.ml.ml_base import MLBaseModel
from paddlets.transform.base import BaseTransform

# Module-level logger; `SearchSpaceConfiger.recommend` writes the recommendation through it.
logger = Logger(__name__)

# Placeholder stored in the default search space for parameters that have no sensible
# default (e.g. the column names of a transformer). It is rendered as
# "**USER_DEFINED_SEARCH_SPACE**" when a search space is converted to a string.
USER_DEFINED_SEARCH_SPACE = "USER_DEFINED_SEARCH_SPACE"

# Two-level lookup used when rendering a ray.tune domain as text:
#   domain class -> sampler class -> name of the ray.tune function that builds it.
# Quantized samplers are not listed here; the renderer prefixes the name of the wrapped
# sampler with "q" instead.
RAY_SAMPLE = {
    Float: {
        Float._Uniform: uniform.__name__,
        Float._Normal: randn.__name__,
        Float._LogUniform: loguniform.__name__,
    },
    Integer: {
        Integer._Uniform: randint.__name__,
        Integer._LogUniform: lograndint.__name__,
    },
    Categorical: {
        Categorical._Uniform: choice.__name__
    }
}


class SearchSpaceConfiger:
    """
    Provide the default automl search space of paddlets estimators.

    The defaults cover paddlets transformers, paddlets models and pipelines built from
    them. A search space is a (possibly nested) dict whose leaves are either ray.tune
    domains, fixed values, or the ``USER_DEFINED_SEARCH_SPACE`` placeholder for
    parameters the user has to fill in.

    """

    def get_default_search_space(self, paddlets_estimator):
        """
        Look up the default search space of a paddlets estimator.

        Args:
            paddlets_estimator: A class(or str) of a paddlets transformer/model, a list of classes(or str)
                consisting of several paddlets transformers and a paddlets model, or a Pipeline instance.

        Returns:
            dict: The domain of the automl to be optimized. For a single estimator this is its parameter
                dict; for a list or a Pipeline it maps ``"<ClassName>-<position>"`` to the parameter dict
                of the estimator at that position (``{}`` when the estimator has no default).

        Raises:
            NotImplementedError: If the estimator is unknown, or if none of the estimators of a
                list/Pipeline has a non-empty default search space.

        """
        # The property rebuilds the whole table on every access, so read it exactly once.
        defaults = self.paddlets_default_search_space

        if isinstance(paddlets_estimator, str):
            known = self._merge_default_groups(defaults)
            if paddlets_estimator in known:
                return known[paddlets_estimator]
        elif isinstance(paddlets_estimator, list):
            return self._indexed_search_space(paddlets_estimator, defaults)
        elif isinstance(paddlets_estimator, Pipeline):
            # Each pipeline step carries the estimator class as its first item.
            step_estimators = [step[0] for step in paddlets_estimator.steps]
            return self._indexed_search_space(step_estimators, defaults)
        elif hasattr(paddlets_estimator, "__mro__"):
            # A class: the first matching base class decides which group is searched.
            bases = paddlets_estimator.__mro__
            if BaseTransform in bases:
                group = defaults["transform"]
            elif PaddleBaseModel in bases:
                group = defaults["models"]["dl"]["paddlepaddle"]
            elif MLBaseModel in bases:
                group = defaults["models"]["ml"]
            else:
                group = {}
            class_name = paddlets_estimator.__name__
            if class_name in group:
                return group[class_name]
        # paddlets_estimator is unknown type (or has no default search space)
        raise NotImplementedError("Unknown estimator")

    def _merge_default_groups(self, defaults):
        """
        Flatten the transform, paddlepaddle and ml groups into one ``{class name: config}`` dict.

        Later groups win on a name clash (transform < paddlepaddle < ml).

        """
        return {**defaults["transform"],
                **defaults["models"]["dl"]["paddlepaddle"],
                **defaults["models"]["ml"]}

    def _indexed_search_space(self, estimators, defaults):
        """
        Build the search space of an ordered sequence of estimators.

        Args:
            estimators: Iterable of estimator classes and/or class names.
            defaults: The default search space table (see ``paddlets_default_search_space``).

        Returns:
            dict: ``"<ClassName>-<position>"`` -> default config (``{}`` if there is none).

        Raises:
            NotImplementedError: If every entry of the result is empty.

        """
        known = self._merge_default_groups(defaults)
        config_dict = {}
        # The position is part of the key so that an estimator used twice keeps two entries.
        for position, estimator in enumerate(estimators):
            name = estimator if isinstance(estimator, str) else estimator.__name__
            config_dict[name + "-" + str(position)] = known.get(name, {})
        if self._sp_empty(config_dict):
            raise NotImplementedError(f"search space is empty, sp: {self.search_space_to_str(config_dict)}")
        return config_dict

    def recommend(self, estimator, verbose=True):
        """
        Recommend a search space for the paddlets estimator.

        Args:
            estimator: A class(or str) of a paddlets model or a list of classes(or str) consisting of several paddlets
                transformers and a paddlets model.
            verbose: If True, the recommendation is also written to the module logger.

        Returns:
            str: The recommendation text, containing the search space in form of str.

        Raises:
            NotImplementedError: Propagated from ``get_default_search_space``.

        """
        recommended_sp = self.search_space_to_str(self.get_default_search_space(estimator))
        res = (
            "The recommended search space are as follows: \n"
            "=======================================================\n"
            "from ray.tune import uniform, quniform, loguniform, qloguniform, "
            "randn, qrandn, randint, qrandint, lograndint, qlograndint, choice\n"
            "recommended_sp = "
            + recommended_sp +
            "\n=====================================================\n"
            "Please note that the **USER_DEFINED_SEARCH_SPACE** "
            "parameters need to be set by the user\n"
        )
        if verbose:
            logger.info(res)
        return res

    def search_space_to_str(self, search_space):
        """
        Convert search space to string

        Args:
            search_space: A (possibly nested) dict whose leaves are ray.tune domains, fixed values or the
                ``USER_DEFINED_SEARCH_SPACE`` placeholder. It is not modified.

        Returns:
            str: Search space in form of str

        """
        # Work on a copy: the conversion below replaces the leaves in place.
        res = copy.deepcopy(search_space)
        self._dfs_search_space_to_str(res)
        return self._to_pretty_str(res)

    def _dfs_search_space_to_str(self, search_space):
        """
        Replace, in place and recursively, every leaf of ``search_space`` by its string form.

        Returns:
            dict: The same dict object that was passed in.

        """
        # Only values of existing keys are reassigned, so iterating while updating is safe.
        for name, value in search_space.items():
            if isinstance(value, dict):
                search_space[name] = self._dfs_search_space_to_str(value)
            else:
                search_space[name] = self._param_search_space_to_str(value)
        return search_space

    def _param_search_space_to_str(self, sp):
        """
        Render a single search space leaf as the expression that would create it.

        Args:
            sp: A ray.tune domain, the ``USER_DEFINED_SEARCH_SPACE`` placeholder, or a fixed value.

        Returns:
            str: e.g. ``"qrandint(8, 128, q=8)"``, ``"choice([True, False])"``, ``"'regression'"``, ``"5"``.

        Raises:
            KeyError: If the domain/sampler combination is not listed in ``RAY_SAMPLE``.

        """
        if not hasattr(sp, "sampler"):
            # Not a ray.tune domain: a placeholder or a fixed value.
            if isinstance(sp, str):
                if sp == USER_DEFINED_SEARCH_SPACE:
                    return "**" + USER_DEFINED_SEARCH_SPACE + "**"
                # Keep the quotes so the rendered search space stays a valid Python literal.
                return repr(sp)
            return f"{sp}"

        sampler = sp.sampler
        if isinstance(sampler, Quantized):
            wrapped = sampler.sampler
            name = "q" + RAY_SAMPLE[sp.__class__][wrapped.__class__]
            if isinstance(wrapped, Normal):
                # A normal sampler is described by mean/sd, not by the domain bounds.
                return name + "(" + str(wrapped.mean) + ", " + str(wrapped.sd) + ", q=" + str(sampler.q) + ")"
            return name + "(" + str(sp.lower) + ", " + str(sp.upper) + ", q=" + str(sampler.q) + ")"

        name = RAY_SAMPLE[sp.__class__][sampler.__class__]
        if isinstance(sp, Categorical):
            return name + "(" + sp.domain_str + ")"
        if isinstance(sampler, Normal):
            return name + "(" + str(sampler.mean) + ", " + str(sampler.sd) + ")"
        return name + "(" + str(sp.lower) + ", " + str(sp.upper) + ")"

    def _to_pretty_str(self, sp_dict):
        """
        Dump a dict of already rendered leaves as tab-indented text.

        The dict is serialized with json and the double quotes json puts around the
        values are then stripped, so that the rendered expressions appear unquoted while
        the keys stay quoted.

        """
        text = json.dumps(sp_dict, indent="\t")
        # Closing quote of a value followed by a comma, opening quote of a value,
        # closing quote of the last value of a dict - in this order.
        for quoted, bare in (('",', ","), (': "', ": "), ('"\n', '\n')):
            text = text.replace(quoted, bare)
        return text

    @property
    def paddlets_default_search_space(self):
        """
        Default search space for paddlets

        A new table (with new ray.tune domain objects) is built on every access.
        Layout: ``{"transform": {name: config}, "models": {"dl": {"paddlepaddle":
        {name: config}, "pytorch": {}}, "ml": {name: config}}}``.

        """
        return {
            "transform": {
                "Fill": {
                    "cols": USER_DEFINED_SEARCH_SPACE,
                    "method": choice(['max', 'min', 'mean', 'median', 'pre', 'next', 'zero']),
                    "window_size": lograndint(10, 30),
                    "min_num_non_missing_values": lograndint(1, 10),
                },
                "OneHot": {
                    "cols": USER_DEFINED_SEARCH_SPACE
                },
                "Ordinal": {
                    "cols": USER_DEFINED_SEARCH_SPACE
                },
                "StatsTransform": {
                    "cols": USER_DEFINED_SEARCH_SPACE,
                    "start": 0,
                    "end": 5,
                },
                "MinMaxScaler": {
                    "col": USER_DEFINED_SEARCH_SPACE,
                    "clip": choice([True, False]),
                },
                "StandardScaler": {
                    "col": USER_DEFINED_SEARCH_SPACE,
                },
                "KSigma": {
                    "cols": USER_DEFINED_SEARCH_SPACE,
                    "k": quniform(0.5, 10, q=0.5)
                },
                "TimeFeatureGenerator": {}
            },
            "models": {
                "dl": {
                    "paddlepaddle": {
                        "MLPRegressor": {
                            "hidden_config": choice([[64], [64] * 2, [64] * 3, [128], [128] * 2, [128] * 3]),
                            "use_bn": choice([True, False]),
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(30, 600, q=30),
                            "optimizer_params": {
                                "learning_rate": uniform(1e-4, 1e-2)
                            },
                            "patience": qrandint(5, 50, q=5)
                        },
                        "RNNBlockRegressor": {
                            "rnn_type_or_module": choice(["SimpleRNN", "LSTM", "GRU"]),
                            "fcn_out_config": choice([[16], [32], [64], [128], [256],
                                                      [16] * 2, [32] * 2, [64] * 2, [128] * 2, [256] * 2,
                                                      [16] * 3, [32] * 3, [64] * 3, [128] * 3,
                                                      [256] * 3]),
                            "hidden_size": qrandint(32, 512, q=32),
                            "num_layers_recurrent": randint(1, 4),
                            "dropout": quniform(0, 0.5, q=0.05),
                            "optimizer_params": {
                                "learning_rate": uniform(1e-4, 1e-2)
                            },
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(30, 600, q=30),
                            "patience": qrandint(5, 50, q=5)
                        },
                        "NBEATSModel": {
                            "generic_architecture": choice([True, False]),
                            "num_stacks": randint(2, 6),
                            "num_blocks": randint(2, 6),
                            "num_layers": randint(1, 6),
                            "layer_widths": qrandint(32, 512, q=32),
                            "expansion_coefficient_dim": qrandint(32, 512, q=32),
                            "trend_polynomial_degree": randint(2, 6),
                            "optimizer_params": {
                                "learning_rate": uniform(1e-4, 1e-2)
                            },
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(300, 1000, q=100),
                            "patience": qrandint(50, 100, q=5)
                        },
                        "NHiTSModel": {
                            "num_stacks": randint(2, 6),
                            "num_blocks": randint(2, 6),
                            "num_layers": randint(2, 6),
                            "layer_widths": 512,
                            "batch_norm": choice([True, False]),
                            "dropout": quniform(0, 0.5, 0.05),
                            "activation": choice(
                                ["ReLU", "PReLU", "ELU", "Softplus", "Tanh", "SELU", "LeakyReLU", "Sigmoid",
                                 "GELU"]),
                            "MaxPool1d": choice([True, False]),
                            "optimizer_params": {
                                "learning_rate": uniform(1e-4, 1e-2)
                            },
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(30, 600, q=30),
                            "verbose": 1,
                            "patience": qrandint(5, 50, q=5)
                        },
                        "LSTNetRegressor": {
                            "skip_size": 1,
                            "channels": choice([1, 2, 4, 8, 16, 32, 64]),
                            "kernel_size": choice([1, 3, 7]),
                            "rnn_cell_type": choice(["GRU", "LSTM"]),
                            "rnn_num_cells": choice([1, 2, 4, 8, 16, 32, 64]),
                            "skip_rnn_cell_type": choice(["GRU", "LSTM"]),
                            "skip_rnn_num_cells": choice([1, 2, 4, 8, 16, 32, 64]),
                            "dropout_rate": quniform(0, 0.5, 0.05),
                            "output_activation": None,
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(30, 600, q=30),
                            "optimizer_params": {
                                "learning_rate": uniform(1e-4, 1e-2)
                            },
                            "patience": qrandint(5, 50, q=5)
                        },
                        "TransformerModel": {
                            "nhead": choice([1, 2, 4, 8]),
                            "num_encoder_layers": randint(1, 11),
                            "num_decoder_layers": randint(1, 11),
                            "dim_feedforward": qrandint(32, 512, q=32),
                            "activation": choice(["relu", "gelu"]),
                            "dropout_rate": quniform(0, 0.5, q=0.05),
                            "d_model": qrandint(32, 512, q=32),
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(30, 600, q=30),
                            "optimizer_params": {
                                "learning_rate": uniform(1e-4, 1e-2)
                            },
                            "patience": qrandint(5, 50, q=5)
                        },
                        "TCNRegressor": {
                            "hidden_config": choice([[64], [64] * 2, [64] * 3, [128] * 2, [128] * 3,
                                                     [8] * 3, [8] * 5, [8] * 7,
                                                     [16] * 3, [16] * 5, [16] * 7]),
                            "kernel_size": choice([3, 5, 7]),
                            "dropout_rate": quniform(0, 0.5, 0.05),
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(300, 1500, q=100),
                            "optimizer_params": {
                                "learning_rate": uniform(1e-4, 1e-2)
                            },
                            "patience": qrandint(50, 150, q=10)
                        },
                        "InformerModel": {
                            "nhead": choice([1, 2, 4, 8]),
                            "num_encoder_layers": randint(1, 11),
                            "num_decoder_layers": randint(1, 11),
                            "activation": choice(["relu", "gelu"]),
                            "dropout_rate": quniform(0, 0.5, q=0.05),
                            "d_model": qrandint(32, 512, q=32),
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(30, 600, q=30),
                            "optimizer_params": {
                                "learning_rate": uniform(1e-4, 1e-2)
                            },
                            "patience": qrandint(5, 50, q=5)
                        },
                        "DeepARModel": {
                            "rnn_type_or_module": choice(["LSTM", "GRU"]),
                            "fcn_out_config": choice([[16], [32], [64], [128], [256],
                                                      [16] * 2, [32] * 2, [64] * 2, [128] * 2, [256] * 2,
                                                      [16] * 3, [32] * 3, [64] * 3, [128] * 3,
                                                      [256] * 3]),
                            "hidden_size": qrandint(32, 512, q=32),
                            "num_layers_recurrent": randint(1, 4),
                            "dropout": quniform(0, 0.5, q=0.05),
                            "optimizer_params": {"learning_rate": uniform(1e-4, 1e-2)},
                            "batch_size": qrandint(8, 128, q=8),
                            "max_epochs": qrandint(30, 600, q=30),
                            "patience": qrandint(5, 50, q=5)
                        }
                    },
                    "pytorch": {}
                },
                "ml": {
                    "LGBM": {
                        "num_boost_round": randint(10, 2000),
                        "early_stopping_rounds": 0,
                        "params": {
                            "boosting": choice(["gbdt", "rf", "dart"]),
                            "objective": "regression",
                            "metric": choice(["mse", "mae"]),
                            "learning_rate": loguniform(1e-4, 0.1),
                            "lambda_l1": quniform(0, 0.3, 0.05),
                            "lambda_l2": quniform(0, 0.3, 0.05),
                            "num_leaves": randint(15, 255),
                            "max_depth": -1,
                            "bagging_freq": randint(1, 6),
                            "bagging_fraction": quniform(0.3, 0.95, 0.05),
                            "feature_fraction": quniform(0.3, 1, 0.1),
                            "min_data_in_leaf": randint(1, 32),
                            "verbose": -1,
                            "num_threads": 1,
                            "seed": 28,
                        },
                    },
                    "ArimaModel": {
                        "p": 0,
                        "d": 0,
                        "q": 1,
                        "trend": choice(["c", "nc"]),
                    }
                }
            }
        }

    def _sp_empty(self, sp_dict):
        """
        Check if the search space for a pipeline is empty

        Returns:
            bool: True when ``sp_dict`` has no entry with a truthy (non-empty) value.

        """
        return not any(sp_dict.values())