Source code for lce._lce

import math
import numbers
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from ._lcetree import LCETreeClassifier, LCETreeRegressor


[docs]class LCEClassifier(ClassifierMixin, BaseEstimator):
    """
    A **Local Cascade Ensemble (LCE) classifier**. LCEClassifier is **compatible with scikit-learn**;
    it passes the `check_estimator <https://scikit-learn.org/stable/modules/generated/sklearn.utils.estimator_checks.check_estimator.html#sklearn.utils.estimator_checks.check_estimator>`_.
    Therefore, it can interact with scikit-learn pipelines and model selection tools.


    Parameters
    ----------
    n_estimators : int, default=10
        The number of trees in the ensemble.

    bootstrap : bool, default=True
        Whether bootstrap samples are used when building trees. If False, the
        whole dataset is used to build each tree.

    criterion : {"gini", "entropy"}, default="gini"
        The function to measure the quality of a split. Supported criteria are
        "gini" for the Gini impurity and "entropy" for the information gain.

    splitter : {"best", "random"}, default="best"
        The strategy used to choose the split at each node. Supported strategies
        are "best" to choose the best split and "random" to choose the best random
        split.

    max_depth : int, default=2
        The maximum depth of a tree.

    max_features : int, float or {"auto", "sqrt", "log2"}, default=None
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `round(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=sqrt(n_features)`.
        - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto").
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_samples : int or float, default=1.0
        The number of samples to draw from X to train each base estimator
        (with replacement by default, see ``bootstrap`` for more details).

        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0.0, 1.0]`.

    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    n_iter : int, default=10
        Number of iterations to set the hyperparameters of each node base
        classifier in Hyperopt.

    metric : str, default="accuracy"
        The score of the base classifier optimized by Hyperopt. Supported metrics
        are the ones from `scikit-learn <https://scikit-learn.org/stable/modules/model_evaluation.html>`_.

    base_learner : {"catboost", "lightgbm", "xgboost"}, default="xgboost"
        The base classifier trained in each node of a tree.

    base_n_estimators : tuple, default=(10, 50, 100)
        The number of estimators of the base learner. The tuple provided is
        the search space used for the hyperparameter optimization (Hyperopt).

    base_max_depth : tuple, default=(3, 6, 9)
        Maximum tree depth for base learners. The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).
        
    base_num_leaves : tuple, default=(20, 50, 100, 500)
        Maximum tree leaves (applicable to LightGBM only). The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).

    base_learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5)
        `learning_rate` of the base learner. The tuple provided is the search space used for the
        hyperparameter optimization (Hyperopt).

    base_booster : ("dart", "gblinear", "gbtree"), default=("gbtree",)
        The type of booster to use (applicable to XGBoost only). "gbtree" and "dart" use tree based models
        while "gblinear" uses linear functions. The tuple provided is the search space used
        for the hyperparameter optimization (Hyperopt).
        
    base_boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",)
        The type of boosting type to use (applicable to LightGBM only): "dart" dropouts meet Multiple Additive 
        Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest. 
        The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).

    base_gamma : tuple, default=(0, 1, 10)
        `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction
        required to make a further partition on a leaf node of the tree.
        The larger `gamma` is, the more conservative XGBoost algorithm will be.
        The tuple provided is the search space used for the hyperparameter optimization
        (Hyperopt).

    base_min_child_weight : tuple, default=(1, 5, 15, 100)
        `min_child_weight` of base learner (applicable to LightGBM and XGBoost only). `min_child_weight` defines the
        minimum sum of instance weight (hessian) needed in a child. If the tree
        partition step results in a leaf node with the sum of instance weight
        less than `min_child_weight`, then the building process will give up further
        partitioning. The larger `min_child_weight` is, the more conservative the base learner
        algorithm will be. The tuple provided is the search space used for the hyperparameter
        optimization (Hyperopt).

    base_subsample : tuple, default=(1.0,)
        Base learner subsample ratio of the training instances (applicable to LightGBM and XGBoost only). 
        Setting it to 0.5 means that the base learner would randomly sample half of the training data prior to
        growing trees, and this will prevent overfitting. Subsampling will occur
        once in every boosting iteration. The tuple provided is the search space used for
        the hyperparameter optimization (Hyperopt).
        
    base_subsample_for_bin : tuple, default=(200000,)
        Number of samples for constructing bins (applicable to LightGBM only). The tuple provided is the
        search space used for the hyperparameter optimization (Hyperopt).

    base_colsample_bytree : tuple, default=(1.0,)
        Base learner subsample ratio of columns when constructing each tree (applicable to LightGBM and XGBoost only).
        Subsampling occurs once for every tree constructed. The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).

    base_colsample_bylevel : tuple, default=(1.0,)
        Subsample ratio of columns for each level (applicable to CatBoost and XGBoost only). Subsampling occurs
        once for every new depth level reached in a tree. Columns are subsampled
        from the set of columns chosen for the current tree. The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).

    base_colsample_bynode : tuple, default=(1.0,)
        Subsample ratio of columns for each node split (applicable to XGBoost only). Subsampling
        occurs once every time a new split is evaluated. Columns are subsampled
        from the set of columns chosen for the current level. The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).

    base_reg_alpha : tuple, default=(0,)
        `reg_alpha` of the base learner (applicable to LightGBM and XGBoost only). 
        `reg_alpha` corresponds to the L1 regularization term on the weights. 
        Increasing this value will make the base learner more conservative. 
        The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).

    base_reg_lambda : tuple, default=(0.1, 1.0, 5.0)
        `reg_lambda` of the base learner. `reg_lambda` corresponds to the L2 regularization term 
        on the weights. Increasing this value will make the base learner more
        conservative. The tuple provided is the search space used for the hyperparameter
        optimization (Hyperopt).

    n_jobs : int, default=None
        The number of jobs to run in parallel.
        ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the bootstrapping of the samples used
        when building trees (if ``bootstrap=True``), the sampling of the
        features to consider when looking for the best split at each node
        (if ``max_features < n_features``), the base classifier and
        the Hyperopt algorithm.

    verbose : int, default=0
        Controls the verbosity when fitting.

    Attributes
    ----------
    base_estimator_ : LCETreeClassifier
        The child estimator template used to create the collection of fitted
        sub-estimators.

    estimators_ : BaggingClassifier
        The fitted bagging ensemble that holds the LCETreeClassifier
        sub-estimators.

    classes_ : ndarray of shape (n_classes,) or a list of such arrays
        The class labels.

    n_classes_ : int
        The number of classes.

    n_features_in_ : int
        The number of features when ``fit`` is performed.

    encoder_ : LabelEncoder
        The encoder to have target labels with value between 0 and n_classes-1.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    The features are always randomly permuted at each split. Therefore,
    the best found split may vary, even with the same training data,
    ``max_features=n_features`` and ``bootstrap=False``, if the improvement
    of the criterion is identical for several splits enumerated during the
    search of the best split. To obtain a deterministic behaviour during
    fitting, ``random_state`` has to be fixed.

    References
    ----------
    .. [1] Fauvel, K., E. Fromont, V. Masson, P. Faverdin and A. Termier. "XEM: An Explainable-by-Design Ensemble Method for Multivariate Time Series Classification", Data Mining and Knowledge Discovery, 36(3):917-957, 2022. https://hal.inria.fr/hal-03599214/document
    """

[docs]    def __init__(
        self,
        n_estimators=10,
        bootstrap=True,
        criterion="gini",
        splitter="best",
        max_depth=2,
        max_features=None,
        max_samples=1.0,
        min_samples_leaf=1,
        n_iter=10,
        metric="accuracy",
        base_learner="xgboost",
        base_n_estimators=(10, 50, 100),
        base_max_depth=(3, 6, 9),
        base_num_leaves=(20, 50, 100, 500),
        base_learning_rate=(0.01, 0.1, 0.3, 0.5),
        base_booster=("gbtree",),
        base_boosting_type=("gbdt",),
        base_gamma=(0, 1, 10),
        base_min_child_weight=(1, 5, 15, 100),
        base_subsample=(1.0,),
        base_subsample_for_bin=(200000,),
        base_colsample_bytree=(1.0,),
        base_colsample_bylevel=(1.0,),
        base_colsample_bynode=(1.0,),
        base_reg_alpha=(0,),
        base_reg_lambda=(0.1, 1.0, 5.0),
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        self.n_estimators = n_estimators
        self.bootstrap = bootstrap
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.max_features = max_features
        self.max_samples = max_samples
        self.min_samples_leaf = min_samples_leaf
        self.n_iter = n_iter
        self.metric = metric
        self.base_learner = base_learner
        self.base_n_estimators = base_n_estimators
        self.base_max_depth = base_max_depth
        self.base_num_leaves = base_num_leaves
        self.base_learning_rate = base_learning_rate
        self.base_booster = base_booster
        self.base_boosting_type = base_boosting_type
        self.base_gamma = base_gamma
        self.base_min_child_weight = base_min_child_weight
        self.base_subsample = base_subsample
        self.base_subsample_for_bin = base_subsample_for_bin
        self.base_colsample_bytree = base_colsample_bytree
        self.base_colsample_bylevel = base_colsample_bylevel
        self.base_colsample_bynode = base_colsample_bynode
        self.base_reg_alpha = base_reg_alpha
        self.base_reg_lambda = base_reg_lambda
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def _generate_estimator(self):
        """
        Build the unfitted LCE tree that carries this ensemble's settings.

        A fresh ``LCETreeClassifier`` is created, the number of classes found
        during ``fit`` is stored on it as ``n_classes_in``, and every
        tree-level hyperparameter of this estimator is copied onto it under
        the same attribute name.

        Returns
        -------
        est : LCETreeClassifier
            The configured, unfitted tree.
        """
        est = LCETreeClassifier()
        est.n_classes_in = self.n_classes_

        # Copy by name so a value can never land on the wrong attribute.
        # The previous hand-written assignments stored ``base_reg_lambda``
        # into ``est.base_reg_alpha``: the L1 search space was overwritten
        # and the L2 search space was never forwarded to the tree.
        forwarded = (
            "criterion",
            "splitter",
            "max_depth",
            "max_features",
            "min_samples_leaf",
            "n_iter",
            "metric",
            "base_learner",
            "base_n_estimators",
            "base_max_depth",
            "base_num_leaves",
            "base_learning_rate",
            "base_booster",
            "base_boosting_type",
            "base_gamma",
            "base_min_child_weight",
            "base_subsample",
            "base_subsample_for_bin",
            "base_colsample_bytree",
            "base_colsample_bylevel",
            "base_colsample_bynode",
            "base_reg_alpha",
            "base_reg_lambda",
            "n_jobs",
            "random_state",
            "verbose",
        )
        for name in forwarded:
            setattr(est, name, getattr(self, name))
        return est

    def _more_tags(self):
        """Update scikit-learn estimator tags."""
        return {"allow_nan": True, "requires_y": True}

    def _validate_extra_parameters(self, X):
        """Validate parameters not already validated by methods employed."""
        # Validate max_depth
        if isinstance(self.max_depth, numbers.Integral):
            if not (0 <= self.max_depth):
                raise ValueError(
                    "max_depth must be greater than or equal to 0, "
                    "got {0}.".format(self.max_depth)
                )
        else:
            raise ValueError("max_depth must be int")

        # Validate min_samples_leaf
        if isinstance(self.min_samples_leaf, numbers.Integral):
            if not 1 <= self.min_samples_leaf:
                raise ValueError(
                    "min_samples_leaf must be at least 1 "
                    "or in (0, 0.5], got %s" % self.min_samples_leaf
                )
        elif isinstance(self.min_samples_leaf, float):
            if not 0.0 < self.min_samples_leaf <= 0.5:
                raise ValueError(
                    "min_samples_leaf must be at least 1 "
                    "or in (0, 0.5], got %s" % self.min_samples_leaf
                )
            self.min_samples_leaf = int(math.ceil(self.min_samples_leaf * X.shape[0]))
        else:
            raise ValueError("min_samples_leaf must be int or float")

        # Validate n_iter
        if isinstance(self.n_iter, numbers.Integral):
            if self.n_iter <= 0:
                raise ValueError(
                    "n_iter must be greater than 0, " "got {0}.".format(self.n_iter)
                )
        else:
            raise ValueError("n_iter must be int")

        # Validate verbose
        if isinstance(self.verbose, numbers.Integral):
            if self.verbose < 0:
                raise ValueError(
                    "verbose must be greater than or equal to 0, "
                    "got {0}.".format(self.verbose)
                )
        else:
            raise ValueError("verbose must be int")

[docs]    def fit(self, X, y):
        """
        Build a forest of LCE trees from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,)
            The class labels.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, force_all_finite="allow-nan")
        check_classification_targets(y)
        self._validate_extra_parameters(X)
        self.n_features_in_ = X.shape[1]
        self.X_ = True
        self.y_ = True
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = self.classes_.size
        self.encoder_ = LabelEncoder()
        self.encoder_.fit(self.classes_)
        self.base_estimator_ = self._generate_estimator()
        self.estimators_ = BaggingClassifier(
            base_estimator=self.base_estimator_,
            n_estimators=self.n_estimators,
            bootstrap=self.bootstrap,
            max_samples=self.max_samples,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
        )
        self.estimators_.fit(X, y)
        return self

[docs]    def predict(self, X):
        """
        Predict class for X.
        The predicted class of an input sample is computed as the class with
        the highest mean predicted probability.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted classes.
        """
        check_is_fitted(self, ["X_", "y_"])
        X = check_array(X, force_all_finite="allow-nan")
        predictions = self.estimators_.predict(X)
        return self.encoder_.inverse_transform(predictions)

[docs]    def predict_proba(self, X):
        """
        Predict class probabilities for X.
        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the base estimators in the
        ensemble.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute ``classes_``.
        """
        check_is_fitted(self, ["X_", "y_"])
        X = check_array(X, force_all_finite="allow-nan")
        return self.estimators_.predict_proba(X)

[docs]    def set_params(self, **params):
        """
        Set the parameters of the estimator.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : object
        """
        if not params:
            return self

        for key, value in params.items():
            if hasattr(self, key):
                setattr(self, key, value)

        return self


[docs]class LCERegressor(RegressorMixin, BaseEstimator):
    """
    A **Local Cascade Ensemble (LCE) regressor**. LCERegressor is **compatible with scikit-learn**;
    it passes the `check_estimator <https://scikit-learn.org/stable/modules/generated/sklearn.utils.estimator_checks.check_estimator.html#sklearn.utils.estimator_checks.check_estimator>`_.
    Therefore, it can interact with scikit-learn pipelines and model selection tools.


    Parameters
    ----------
    n_estimators : int, default=10
        The number of trees in the ensemble.

    bootstrap : bool, default=True
        Whether bootstrap samples are used when building trees. If False, the
        whole dataset is used to build each tree.

    criterion : {"squared_error", "friedman_mse", "absolute_error", "poisson"}, default="squared_error"
        The function to measure the quality of a split. Supported criteria are "squared_error" for
        the mean squared error, which is equal to variance reduction as feature selection
        criterion and minimizes the L2 loss using the mean of each terminal node,
        "friedman_mse", which uses mean squared error with Friedman's improvement score
        for potential splits, "absolute_error" for the mean absolute error, which
        minimizes the L1 loss using the median of each terminal node, and "poisson"
        which uses reduction in Poisson deviance to find splits.

    splitter : {"best", "random"}, default="best"
        The strategy used to choose the split at each node. Supported strategies
        are "best" to choose the best split and "random" to choose the best random
        split.

    max_depth : int, default=2
        The maximum depth of a tree.

    max_features : int, float or {"auto", "sqrt", "log2"}, default=None
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `round(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=sqrt(n_features)`.
        - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto").
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_samples : int or float, default=1.0
        The number of samples to draw from X to train each base estimator
        (with replacement by default, see ``bootstrap`` for more details).

        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0.0, 1.0]`.

    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

    n_iter : int, default=10
        Number of iterations to set the hyperparameters of each node base
        regressor in Hyperopt.

    metric : str, default="neg_mean_squared_error"
        The score of the base regressor optimized by Hyperopt. Supported metrics
        are the ones from `scikit-learn <https://scikit-learn.org/stable/modules/model_evaluation.html>`_.

    base_learner : {"catboost", "lightgbm", "xgboost"}, default="xgboost"
        The base regressor trained in each node of a tree.

    base_n_estimators : tuple, default=(10, 50, 100)
        The number of estimators of the base learner. The tuple provided is
        the search space used for the hyperparameter optimization (Hyperopt).

    base_max_depth : tuple, default=(3, 6, 9)
        Maximum tree depth for base learners. The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).
        
    base_num_leaves : tuple, default=(20, 50, 100, 500)
        Maximum tree leaves (applicable to LightGBM only). The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).

    base_learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5)
        `learning_rate` of the base learner. The tuple provided is the search space used for the
        hyperparameter optimization (Hyperopt).

    base_booster : ("dart", "gblinear", "gbtree"), default=("gbtree",)
        The type of booster to use (applicable to XGBoost only). "gbtree" and "dart" use tree based models
        while "gblinear" uses linear functions. The tuple provided is the search space used
        for the hyperparameter optimization (Hyperopt).
        
    base_boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",)
        The type of boosting type to use (applicable to LightGBM only): "dart" dropouts meet Multiple Additive 
        Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest. 
        The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).

    base_gamma : tuple, default=(0, 1, 10)
        `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction
        required to make a further partition on a leaf node of the tree.
        The larger `gamma` is, the more conservative XGBoost algorithm will be.
        The tuple provided is the search space used for the hyperparameter optimization
        (Hyperopt).

    base_min_child_weight : tuple, default=(1, 5, 15, 100)
        `min_child_weight` of base learner (applicable to LightGBM and XGBoost only). `min_child_weight` defines the
        minimum sum of instance weight (hessian) needed in a child. If the tree
        partition step results in a leaf node with the sum of instance weight
        less than `min_child_weight`, then the building process will give up further
        partitioning. The larger `min_child_weight` is, the more conservative the base learner
        algorithm will be. The tuple provided is the search space used for the hyperparameter
        optimization (Hyperopt).

    base_subsample : tuple, default=(1.0,)
        Base learner subsample ratio of the training instances (applicable to LightGBM and XGBoost only). 
        Setting it to 0.5 means that the base learner would randomly sample half of the training data prior to
        growing trees, and this will prevent overfitting. Subsampling will occur
        once in every boosting iteration. The tuple provided is the search space used for
        the hyperparameter optimization (Hyperopt).
        
    base_subsample_for_bin : tuple, default=(200000,)
        Number of samples for constructing bins (applicable to LightGBM only). The tuple provided is the
        search space used for the hyperparameter optimization (Hyperopt).

    base_colsample_bytree : tuple, default=(1.0,)
        Base learner subsample ratio of columns when constructing each tree (applicable to LightGBM and XGBoost only).
        Subsampling occurs once for every tree constructed. The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).

    base_colsample_bylevel : tuple, default=(1.0,)
        Subsample ratio of columns for each level (applicable to CatBoost and XGBoost only). Subsampling occurs
        once for every new depth level reached in a tree. Columns are subsampled
        from the set of columns chosen for the current tree. The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).

    base_colsample_bynode : tuple, default=(1.0,)
        Subsample ratio of columns for each node split (applicable to XGBoost only). Subsampling
        occurs once every time a new split is evaluated. Columns are subsampled
        from the set of columns chosen for the current level. The tuple provided is the search
        space used for the hyperparameter optimization (Hyperopt).

    base_reg_alpha : tuple, default=(0,)
        `reg_alpha` of the base learner (applicable to LightGBM and XGBoost only). 
        `reg_alpha` corresponds to the L1 regularization term on the weights. 
        Increasing this value will make the base learner more conservative. 
        The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).

    base_reg_lambda : tuple, default=(0.1, 1.0, 5.0)
        `reg_lambda` of the base learner. `reg_lambda` corresponds to the L2 regularization term 
        on the weights. Increasing this value will make the base learner more
        conservative. The tuple provided is the search space used for the hyperparameter
        optimization (Hyperopt).

    n_jobs : int, default=None
        The number of jobs to run in parallel.
        ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the bootstrapping of the samples used
        when building trees (if ``bootstrap=True``), the sampling of the
        features to consider when looking for the best split at each node
        (if ``max_features < n_features``), the base regressor and
        the Hyperopt algorithm.

    verbose : int, default=0
        Controls the verbosity when fitting.

    Attributes
    ----------
    base_estimator_ : LCETreeRegressor
        The child estimator template used to create the collection of fitted
        sub-estimators.

    estimators_ : list of LCETreeRegressor
        The collection of fitted sub-estimators.

    n_features_in_ : int
        The number of features when ``fit`` is performed.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    The features are always randomly permuted at each split. Therefore,
    the best found split may vary, even with the same training data,
    ``max_features=n_features`` and ``bootstrap=False``, if the improvement
    of the criterion is identical for several splits enumerated during the
    search of the best split. To obtain a deterministic behaviour during
    fitting, ``random_state`` has to be fixed.
    """

[docs]    def __init__(
        self,
        n_estimators=10,
        bootstrap=True,
        criterion="squared_error",
        splitter="best",
        max_depth=2,
        max_features=None,
        max_samples=1.0,
        min_samples_leaf=1,
        metric="neg_mean_squared_error",
        n_iter=10,
        base_learner="xgboost",
        base_n_estimators=(10, 50, 100),
        base_max_depth=(3, 6, 9),
        base_num_leaves=(20, 50, 100, 500),
        base_learning_rate=(0.01, 0.1, 0.3, 0.5),
        base_booster=("gbtree",),
        base_boosting_type=("gbdt",),
        base_gamma=(0, 1, 10),
        base_min_child_weight=(1, 5, 15, 100),
        base_subsample=(1.0,),
        base_subsample_for_bin=(200000,),
        base_colsample_bytree=(1.0,),
        base_colsample_bylevel=(1.0,),
        base_colsample_bynode=(1.0,),
        base_reg_alpha=(0,),
        base_reg_lambda=(0.1, 1.0, 5.0),
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        self.n_estimators = n_estimators
        self.bootstrap = bootstrap
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.max_features = max_features
        self.max_samples = max_samples
        self.min_samples_leaf = min_samples_leaf
        self.n_iter = n_iter
        self.metric = metric
        self.base_learner = base_learner
        self.base_n_estimators = base_n_estimators
        self.base_max_depth = base_max_depth
        self.base_num_leaves = base_num_leaves
        self.base_learning_rate = base_learning_rate
        self.base_booster = base_booster
        self.base_boosting_type = base_boosting_type
        self.base_gamma = base_gamma
        self.base_min_child_weight = base_min_child_weight
        self.base_subsample = base_subsample
        self.base_subsample_for_bin = base_subsample_for_bin
        self.base_colsample_bytree = base_colsample_bytree
        self.base_colsample_bylevel = base_colsample_bylevel
        self.base_colsample_bynode = base_colsample_bynode
        self.base_reg_alpha = base_reg_alpha
        self.base_reg_lambda = base_reg_lambda
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def _generate_estimator(self):
        """
        Create an unfitted LCE tree configured like this ensemble.

        Every tree-level hyperparameter stored on ``self`` is copied to the
        attribute of the same name on a new ``LCETreeRegressor``. The
        bagging-level settings (``n_estimators``, ``bootstrap`` and
        ``max_samples``) are not forwarded.

        Returns
        -------
        est : LCETreeRegressor
            The configured, unfitted tree.
        """
        forwarded = (
            "criterion",
            "splitter",
            "max_depth",
            "max_features",
            "min_samples_leaf",
            "n_iter",
            "metric",
            "base_learner",
            "base_n_estimators",
            "base_max_depth",
            "base_num_leaves",
            "base_learning_rate",
            "base_booster",
            "base_boosting_type",
            "base_gamma",
            "base_min_child_weight",
            "base_subsample",
            "base_subsample_for_bin",
            "base_colsample_bytree",
            "base_colsample_bylevel",
            "base_colsample_bynode",
            "base_reg_alpha",
            "base_reg_lambda",
            "n_jobs",
            "random_state",
            "verbose",
        )
        est = LCETreeRegressor()
        # Copy each value under its own name so that a parameter can never
        # end up stored under another parameter's attribute.
        for name in forwarded:
            setattr(est, name, getattr(self, name))
        return est

    def _more_tags(self):
        """Return the scikit-learn estimator tags overridden by this estimator."""
        tags = dict(allow_nan=True, requires_y=True)
        return tags

    def _validate_extra_parameters(self, X):
        """
        Validate parameters not already validated by methods employed.

        ``max_depth``, ``min_samples_leaf``, ``n_iter`` and ``verbose`` are
        checked in that order. A float ``min_samples_leaf`` is treated as a
        fraction of the number of rows of ``X`` and is replaced on the
        instance by the corresponding count, rounded up.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples. Only ``X.shape[0]`` is read.

        Raises
        ------
        ValueError
            If a checked parameter has an unsupported type or is out of range.
        """
        # max_depth: integer, at least 0.
        depth = self.max_depth
        if not isinstance(depth, numbers.Integral):
            raise ValueError("max_depth must be int")
        if depth < 0:
            raise ValueError(
                "max_depth must be greater than or equal to 0, got {0}.".format(depth)
            )

        # min_samples_leaf: integer count (>= 1) or float fraction in (0, 0.5].
        leaf = self.min_samples_leaf
        is_count = isinstance(leaf, numbers.Integral)
        is_fraction = not is_count and isinstance(leaf, float)
        if not (is_count or is_fraction):
            raise ValueError("min_samples_leaf must be int or float")
        # The chained comparison is kept so that NaN is rejected as well.
        in_range = (1 <= leaf) if is_count else (0.0 < leaf <= 0.5)
        if not in_range:
            raise ValueError(
                "min_samples_leaf must be at least 1 or in (0, 0.5], got %s" % leaf
            )
        if is_fraction:
            # Turn the fraction into an absolute number of samples.
            self.min_samples_leaf = int(math.ceil(leaf * X.shape[0]))

        # n_iter: integer, strictly positive.
        iterations = self.n_iter
        if not isinstance(iterations, numbers.Integral):
            raise ValueError("n_iter must be int")
        if iterations <= 0:
            raise ValueError(
                "n_iter must be greater than 0, got {0}.".format(iterations)
            )

        # verbose: integer, at least 0.
        verbosity = self.verbose
        if not isinstance(verbosity, numbers.Integral):
            raise ValueError("verbose must be int")
        if verbosity < 0:
            raise ValueError(
                "verbose must be greater than or equal to 0, got {0}.".format(verbosity)
            )

[docs]    def fit(self, X, y):
        """
        Build a forest of LCE trees from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,)
            The target values (real numbers).

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, y_numeric=True, force_all_finite="allow-nan")
        self._validate_extra_parameters(X)
        self.n_features_in_ = X.shape[1]
        self.X_ = True
        self.y_ = True
        self.base_estimator_ = self._generate_estimator()
        self.estimators_ = BaggingRegressor(
            base_estimator=self.base_estimator_,
            n_estimators=self.n_estimators,
            bootstrap=self.bootstrap,
            max_samples=self.max_samples,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
        )
        self.estimators_.fit(X, y)
        return self

[docs]    def predict(self, X):
        """
        Predict regression target for X.
        The predicted regression target of an input sample is computed as the
        mean predicted regression targets of the trees in the forest.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted values.
        """
        check_is_fitted(self, ["X_", "y_"])
        X = check_array(X, force_all_finite="allow-nan")
        return self.estimators_.predict(X)

[docs]    def set_params(self, **params):
        """
        Set the parameters of the estimator.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : object
        """
        if not params:
            return self

        for key, value in params.items():
            if hasattr(self, key):
                setattr(self, key, value)

        return self