"""
These classes are wrappers for the `XGBoost library <https://github.com/dmlc/xgboost>`_.
"""
from __future__ import division, print_function, absolute_import
import tempfile
import os
from abc import ABCMeta
import pandas
import numpy
from sklearn.utils import check_random_state
from .utils import normalize_weights, remove_first_line
from .interface import Classifier, Regressor
from .utils import check_inputs
__author__ = 'Mikhail Hushchyn, Alex Rogozhnikov'
__all__ = ['XGBoostBase', 'XGBoostClassifier', 'XGBoostRegressor']
try:
import xgboost as xgb
except ImportError as e:
raise ImportError("please install xgboost: {}".format(e))
class XGBoostBase(object):
"""
A base class for the XGBoostClassifier and XGBoostRegressor. XGBoost tree booster is used.
:param int n_estimators: number of trees built.
:param int nthreads: number of parallel threads used to run XGBoost.
:param num_feature: feature dimension used in boosting; set to the maximum dimension of the features
(set automatically by XGBoost, no need to be set by the user).
:type num_feature: None or int
:param float gamma: minimum loss reduction required to make a further partition on a leaf node of the tree.
The larger gamma is, the more conservative the algorithm will be.
:type gamma: None or float
:param float eta: (also known as learning rate) step size shrinkage used in updates to prevent overfitting.
After each boosting step, we can directly get the weights of new features,
and eta shrinks the feature weights to make the boosting process more conservative.
:param int max_depth: maximum depth of a tree.
:param float scale_pos_weight: ratio of the weights of class 1 to the weights of class 0.
:param float min_child_weight: minimum sum of instance weight (hessian) needed in a child.
If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight,
then the building process will give up further partitioning.
.. note:: weights are normalized so that mean=1 before fitting, hence min_child_weight roughly corresponds to a number of events.
:param float subsample: subsample ratio of the training instances.
Setting it to 0.5 means that XGBoost randomly samples half of the data instances to grow trees,
which helps prevent overfitting.
:param float colsample: subsample ratio of columns when constructing each tree.
:param float base_score: the initial prediction score of all instances, global bias.
:param random_state: state for the pseudo-random number generator
:type random_state: None or int or RandomState
:param bool verbose: if 1, print messages during training
:param float missing: the value which XGBoost treats as missing.
"""
__metaclass__ = ABCMeta
def __init__(self,
n_estimators=100,
nthreads=16,
num_feature=None,
gamma=None,
eta=0.3,
max_depth=6,
scale_pos_weight=1.,
min_child_weight=1.,
subsample=1.,
colsample=1.,
base_score=0.5,
verbose=0,
missing=-999.,
random_state=0):
self.n_estimators = n_estimators
self.missing = missing
self.nthreads = nthreads
self.num_feature = num_feature
self.gamma = gamma
self.eta = eta
self.max_depth = max_depth
self.scale_pos_weight = scale_pos_weight
self.min_child_weight = min_child_weight
self.subsample = subsample
self.colsample = colsample
self.objective = None
self.base_score = base_score
self.verbose = verbose
self.random_state = random_state
self._num_class = None
self.xgboost_estimator = None
def _make_dmatrix(self, X, y=None, sample_weight=None):
"""
Create an XGBoost DMatrix from the input data.
:return: XGBoost DMatrix
"""
feature_names = [str(i) for i in range(X.shape[1])]
matrix = xgb.DMatrix(data=X, label=y, weight=sample_weight,
missing=self.missing, feature_names=feature_names)
return matrix
def _check_fitted(self):
assert self.xgboost_estimator is not None, "Estimator wasn't fitted, please call `fit` first"
def _fit(self, X, y, estimator_type, sample_weight=None, **kwargs):
"""
Train a classification/regression model on the data.
:param pandas.DataFrame X: data of shape [n_samples, n_features]
:param y: labels of samples, array-like of shape [n_samples]
:param sample_weight: weight of samples,
array-like of shape [n_samples] or None if all weights are equal
:param str estimator_type: XGBoost objective of the estimator (e.g. 'multi:softprob' or 'reg:linear')
:param dict kwargs: additional parameters
:return: self
"""
if self.random_state is None:
seed = 0
elif isinstance(self.random_state, int):
seed = self.random_state
else:
seed = check_random_state(self.random_state).randint(0, 10000)
self.objective = estimator_type
params = {"nthread": self.nthreads,
"eta": self.eta,
"max_depth": self.max_depth,
"scale_pos_weight": self.scale_pos_weight,
"min_child_weight": self.min_child_weight,
"subsample": self.subsample,
"colsample_bytree": self.colsample,
"objective": self.objective,
"base_score": self.base_score,
"silent": int(not self.verbose),
"seed": seed}
for key, value in kwargs.items():
params[key] = value
if key == 'num_class':
self._num_class = value
if self.num_feature is not None:
params["num_feature"] = self.num_feature
if self.gamma is not None:
params["gamma"] = self.gamma
xgboost_matrix = self._make_dmatrix(X, y, sample_weight)
self.xgboost_estimator = xgb.train(params, xgboost_matrix, num_boost_round=self.n_estimators)
return self
def __getstate__(self):
result = self.__dict__.copy()
del result['xgboost_estimator']
if self.xgboost_estimator is None:
result['dumped_xgboost'] = None
else:
with tempfile.NamedTemporaryFile() as dump:
self._save_model(dump.name)
with open(dump.name, 'rb') as dumpfile:
result['dumped_xgboost'] = dumpfile.read()
return result
def __setstate__(self, state):
self.__dict__ = state
if state['dumped_xgboost'] is None:
self.xgboost_estimator = None
else:
with tempfile.NamedTemporaryFile() as dump:
with open(dump.name, 'wb') as dumpfile:
dumpfile.write(state['dumped_xgboost'])
self._load_model(dump.name)
# HACK: xgboost does not restore num_class when reloading a model, so set it explicitly
if state.get('_num_class') is not None:
self.xgboost_estimator.set_param({'num_class': state['_num_class']})
del state['dumped_xgboost']
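# Hedged sketch: the __getstate__/__setstate__ pair above makes estimators picklable
# even though the underlying Booster is a C++ object. A minimal round-trip, assuming
# a hypothetical fitted classifier `clf` and feature DataFrame `X`:
#
#     import pickle
#     blob = pickle.dumps(clf)
#     clf_restored = pickle.loads(blob)
#     assert numpy.allclose(clf.predict_proba(X), clf_restored.predict_proba(X))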
def _save_model(self, path_to_dump):
""" Save XGBoost model"""
self._check_fitted()
self.xgboost_estimator.save_model(path_to_dump)
def _load_model(self, path_to_dumped_model):
""" Load XGBoost model to estimator """
assert os.path.exists(path_to_dumped_model), 'there is no such file: {}'.format(path_to_dumped_model)
self.xgboost_estimator = xgb.Booster({'nthread': self.nthreads}, model_file=path_to_dumped_model)
def get_feature_importances(self):
"""
Get feature importances.
:rtype: pandas.DataFrame with `index=self.features`
"""
self._check_fitted()
feature_score = self.xgboost_estimator.get_fscore()
reordered_scores = numpy.zeros(len(self.features))
for name, score in feature_score.items():
reordered_scores[int(name)] = score
return pandas.DataFrame({'effect': reordered_scores}, index=self.features)
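# Hedged usage sketch: for a hypothetical fitted estimator `est`, the returned
# DataFrame can be sorted to rank features (sort_values needs pandas >= 0.17):
#
#     importances = est.get_feature_importances()
#     print(importances.sort_values('effect', ascending=False))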
@property
def feature_importances_(self):
"""Sklearn-way of returning feature importance.
This returned as numpy.array, assuming that initially passed train_features=None """
self._check_fitted()
return self.get_feature_importances().loc[self.features, 'effect'].values
class XGBoostClassifier(XGBoostBase, Classifier):
__doc__ = 'Implements a classification model from the XGBoost library. \n'\
+ remove_first_line(XGBoostBase.__doc__)
def __init__(self, features=None,
n_estimators=100,
nthreads=16,
num_feature=None,
gamma=None,
eta=0.3,
max_depth=6,
scale_pos_weight=1.,
min_child_weight=1.,
subsample=1.,
colsample=1.,
base_score=0.5,
verbose=0,
missing=-999.,
random_state=0):
XGBoostBase.__init__(self,
n_estimators=n_estimators,
nthreads=nthreads,
num_feature=num_feature,
gamma=gamma,
eta=eta,
max_depth=max_depth,
scale_pos_weight=scale_pos_weight,
min_child_weight=min_child_weight,
subsample=subsample,
colsample=colsample,
base_score=base_score,
verbose=verbose,
missing=missing,
random_state=random_state)
Classifier.__init__(self, features=features)
def fit(self, X, y, sample_weight=None):
X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=False)
sample_weight = normalize_weights(y, sample_weight=sample_weight, per_class=False)
X = self._get_features(X)
self._set_classes(y)
# a classification problem always has at least two classes, so 'multi:softprob' is used uniformly
return self._fit(X, y, 'multi:softprob', sample_weight=sample_weight, num_class=self.n_classes_)
fit.__doc__ = Classifier.fit.__doc__
def predict_proba(self, X):
self._check_fitted()
X_dmat = self._make_dmatrix(self._get_features(X))
prediction = self.xgboost_estimator.predict(X_dmat, ntree_limit=0)
return prediction.reshape(X.shape[0], self.n_classes_)
predict_proba.__doc__ = Classifier.predict_proba.__doc__
def staged_predict_proba(self, X, step=None):
"""
Predict probabilities for data for each class label at each stage.
:param pandas.DataFrame X: data of shape [n_samples, n_features]
:param int step: step between returned iterations (None by default).
XGBoost does not implement staged prediction natively, so predictions
are recomputed from the first tree each time.
When `None` is passed, the step is chosen so that the learning curve has 10 points.
:return: iterator
.. warning:: this method may be very slow; it takes O(iterations^2 / step) time.
"""
self._check_fitted()
X_dmat = self._make_dmatrix(self._get_features(X))
if step is None:
step = max(self.n_estimators // 10, 1)
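# each prediction below re-evaluates the first i * step trees from scratch,
# which is why the total cost grows quadratically with the number of trees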
# TODO use applying tree-by-tree
for i in range(1, self.n_estimators // step + 1):
prediction = self.xgboost_estimator.predict(X_dmat, ntree_limit=i * step)
yield prediction.reshape(X.shape[0], self.n_classes_)
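# A minimal end-to-end sketch for the classifier; `data` (a pandas.DataFrame of
# features) and `labels` (an array of class labels) are hypothetical names:
#
#     clf = XGBoostClassifier(n_estimators=50, eta=0.1, max_depth=4)
#     clf.fit(data, labels)
#     proba = clf.predict_proba(data)  # array of shape [n_samples, n_classes]
#     for stage_proba in clf.staged_predict_proba(data, step=10):
#         pass  # e.g. track quality as a function of the number of trees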
class XGBoostRegressor(XGBoostBase, Regressor):
__doc__ = 'Implements a regression model from the XGBoost library. \n' + remove_first_line(XGBoostBase.__doc__)
def __init__(self, features=None,
n_estimators=100,
nthreads=16,
num_feature=None,
gamma=None,
eta=0.3,
max_depth=6,
min_child_weight=1.,
subsample=1.,
colsample=1.,
objective_type='linear',
base_score=0.5,
verbose=0,
missing=-999.,
random_state=0):
XGBoostBase.__init__(self,
n_estimators=n_estimators,
nthreads=nthreads,
num_feature=num_feature,
gamma=gamma,
eta=eta,
max_depth=max_depth,
min_child_weight=min_child_weight,
subsample=subsample,
colsample=colsample,
base_score=base_score,
verbose=verbose,
missing=missing,
random_state=random_state)
Regressor.__init__(self, features=features)
self.objective_type = objective_type
def fit(self, X, y, sample_weight=None):
X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=False)
sample_weight = normalize_weights(y, sample_weight=sample_weight, per_class=False)
X = self._get_features(X)
assert self.objective_type in {'linear', 'logistic'}, 'objective_type must be "linear" or "logistic"'
return self._fit(X, y, "reg:{}".format(self.objective_type), sample_weight=sample_weight)
fit.__doc__ = Regressor.fit.__doc__
def predict(self, X):
self._check_fitted()
X_dmat = self._make_dmatrix(self._get_features(X))
return self.xgboost_estimator.predict(X_dmat, ntree_limit=0)
predict.__doc__ = Regressor.predict.__doc__
def staged_predict(self, X, step=None):
"""
Predict values for data at each stage.
:param pandas.DataFrame X: data of shape [n_samples, n_features]
:param int step: step between returned iterations (None by default).
XGBoost does not implement staged prediction natively, so predictions
are recomputed from the first tree each time.
When `None` is passed, the step is chosen so that the learning curve has 10 points.
:return: iterator
.. warning:: this method may be very slow; it takes O(iterations^2 / step) time.
"""
self._check_fitted()
X_dmat = self._make_dmatrix(self._get_features(X))
if step is None:
step = max(self.n_estimators // 10, 1)
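# as in the classifier, each prediction re-evaluates the first i * step trees,
# hence the quadratic cost mentioned in the docstring warning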
# TODO use applying tree-by-tree
for i in range(1, self.n_estimators // step + 1):
yield self.xgboost_estimator.predict(X_dmat, ntree_limit=i * step)
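# A minimal regression sketch; `data` (a pandas.DataFrame of features) and
# `target` (an array of regression targets) are hypothetical names:
#
#     reg = XGBoostRegressor(n_estimators=50, objective_type='linear')
#     reg.fit(data, target)
#     predictions = reg.predict(data)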