Source code for rep.estimators.xgboost

"""
These classes are wrappers for `XGBoost library <https://github.com/dmlc/xgboost>`_.
"""
from __future__ import division, print_function, absolute_import

import tempfile
import os
from abc import ABCMeta

import pandas
import numpy
from sklearn.utils import check_random_state

from .utils import normalize_weights, remove_first_line
from .interface import Classifier, Regressor
from .utils import check_inputs

__author__ = 'Mikhail Hushchyn, Alex Rogozhnikov'
__all__ = ['XGBoostBase', 'XGBoostClassifier', 'XGBoostRegressor']

try:
    import xgboost as xgb
except ImportError as e:
    raise ImportError("please install xgboost")


class XGBoostBase(object):
    """
    A base class for the XGBoostClassifier and XGBoostRegressor. XGBoost tree booster is used.

    :param int n_estimators: number of trees built.
    :param int nthreads: number of parallel threads used to run XGBoost.
    :param num_feature: feature dimension used in boosting, set to the maximum dimension of the features
        (set automatically by XGBoost, no need to be set by the user).
    :type num_feature: None or int
    :param float gamma: minimum loss reduction required to make a further partition on a leaf node of the tree.
        The larger, the more conservative the algorithm will be.
    :type gamma: None or float
    :param float eta: (or learning rate) step size shrinkage used in update to prevent overfitting.
        After each boosting step, we can directly get the weights of new features
        and eta actually shrinks the feature weights to make the boosting process more conservative.
    :param int max_depth: maximum depth of a tree.
    :param float scale_pos_weight: ratio of the weights of class 1 to the weights of class 0.
    :param float min_child_weight: minimum sum of instance weight (hessian) needed in a child.
        If the tree partition step results in a leaf node with the sum of instance weight less than
        min_child_weight, then the building process will give up further partitioning.

        .. note:: weights are normalized so that mean=1 before fitting,
            so min_child_weight is roughly equal to the number of events.

    :param float subsample: subsample ratio of the training instances.
        Setting it to 0.5 means that XGBoost randomly collects half of the data instances
        to grow trees and this will prevent overfitting.
    :param float colsample: subsample ratio of columns when constructing each tree.
    :param float base_score: the initial prediction score of all instances, global bias.
    :param random_state: state for a pseudo random generator
    :type random_state: None or int or RandomState
    :param bool verbose: if 1, will print messages during training
    :param float missing: the number considered by XGBoost as missing value.
    """

    __metaclass__ = ABCMeta

    def __init__(self, n_estimators=100, nthreads=16, num_feature=None, gamma=None, eta=0.3, max_depth=6,
                 scale_pos_weight=1., min_child_weight=1., subsample=1., colsample=1., base_score=0.5,
                 verbose=0, missing=-999., random_state=0):
        self.n_estimators = n_estimators
        self.missing = missing
        self.nthreads = nthreads
        self.num_feature = num_feature
        self.gamma = gamma
        self.eta = eta
        self.max_depth = max_depth
        self.scale_pos_weight = scale_pos_weight
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample = colsample
        self.objective = None
        self.base_score = base_score
        self.verbose = verbose
        self.random_state = random_state
        self._num_class = None
        self.xgboost_estimator = None

    def _make_dmatrix(self, X, y=None, sample_weight=None):
        """
        Create XGBoost data from initial data.

        :return: XGBoost DMatrix
        """
        feature_names = [str(i) for i in range(X.shape[1])]
        matrix = xgb.DMatrix(data=X, label=y, weight=sample_weight, missing=self.missing,
                             feature_names=feature_names)
        return matrix

    def _check_fitted(self):
        assert self.xgboost_estimator is not None, "Classifier wasn't fitted, please call `fit` first"

    def _fit(self, X, y, estimator_type, sample_weight=None, **kwargs):
        """
        Train a classification/regression model on the data.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: labels of samples, array-like of shape [n_samples]
        :param sample_weight: weight of samples, array-like of shape [n_samples] or None if all weights are equal
        :param str estimator_type: type of the estimator (binary, reg or mult)
        :param dict kwargs: additional parameters
        :return: self
        """
        if self.random_state is None:
            seed = 0
        elif isinstance(self.random_state, int):
            seed = self.random_state
        else:
            seed = check_random_state(self.random_state).randint(0, 10000)

        self.objective = estimator_type
        params = {"nthread": self.nthreads,
                  "eta": self.eta,
                  "max_depth": self.max_depth,
                  "scale_pos_weight": self.scale_pos_weight,
                  "min_child_weight": self.min_child_weight,
                  "subsample": self.subsample,
                  "colsample_bytree": self.colsample,
                  "objective": self.objective,
                  "base_score": self.base_score,
                  "silent": int(not self.verbose),
                  "seed": seed}

        for key, value in kwargs.items():
            params[key] = value
            if key == 'num_class':
                self._num_class = value

        if self.num_feature is not None:
            params["num_feature"] = self.num_feature
        if self.gamma is not None:
            params["gamma"] = self.gamma

        xgboost_matrix = self._make_dmatrix(X, y, sample_weight)
        self.xgboost_estimator = xgb.train(params, xgboost_matrix, num_boost_round=self.n_estimators)
        return self

    def __getstate__(self):
        result = self.__dict__.copy()
        del result['xgboost_estimator']
        if self.xgboost_estimator is None:
            result['dumped_xgboost'] = None
        else:
            with tempfile.NamedTemporaryFile() as dump:
                self._save_model(dump.name)
                with open(dump.name, 'rb') as dumpfile:
                    result['dumped_xgboost'] = dumpfile.read()
        return result

    def __setstate__(self, dict):
        self.__dict__ = dict
        if dict['dumped_xgboost'] is None:
            self.xgboost_estimator = None
        else:
            with tempfile.NamedTemporaryFile() as dump:
                with open(dump.name, 'wb') as dumpfile:
                    dumpfile.write(dict['dumped_xgboost'])
                self._load_model(dump.name)
            # HACK: works around an error in xgboost reloading
            if '_num_class' in dict:
                self.xgboost_estimator.set_param({'num_class': dict['_num_class']})
        del dict['dumped_xgboost']

    def _save_model(self, path_to_dump):
        """ Save XGBoost model """
        self._check_fitted()
        self.xgboost_estimator.save_model(path_to_dump)

    def _load_model(self, path_to_dumped_model):
        """ Load XGBoost model to estimator """
        assert os.path.exists(path_to_dumped_model), 'there is no such file: {}'.format(path_to_dumped_model)
        self.xgboost_estimator = xgb.Booster({'nthread': self.nthreads}, model_file=path_to_dumped_model)

    def get_feature_importances(self):
        """
        Get feature importances.

        :rtype: pandas.DataFrame with `index=self.features`
        """
        self._check_fitted()
        feature_score = self.xgboost_estimator.get_fscore()
        reordered_scores = numpy.zeros(len(self.features))
        for name, score in feature_score.items():
            reordered_scores[int(name)] = score
        return pandas.DataFrame({'effect': reordered_scores}, index=self.features)

    @property
    def feature_importances_(self):
        """Sklearn-way of returning feature importance.
        This is returned as numpy.array, assuming that initially passed train_features=None """
        self._check_fitted()
        return self.get_feature_importances().ix[self.features, 'effect'].values
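
# Usage sketch (illustrative, not part of the library): a minimal round trip through pickle,
# exercising the __getstate__/__setstate__ hooks above, followed by a feature-importance lookup.
# The synthetic dataset, column names and the helper name are assumptions made for this example only;
# XGBoostClassifier is defined further down in this module, so the reference resolves at call time.
def _example_pickle_roundtrip():
    import pickle
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=4, random_state=0)
    X = pandas.DataFrame(X, columns=['a', 'b', 'c', 'd'])

    clf = XGBoostClassifier(features=['a', 'b', 'c', 'd'], n_estimators=20, nthreads=2)
    clf.fit(X, y)

    # the fitted booster is serialized through a temporary dump file and restored on unpickling
    clf_restored = pickle.loads(pickle.dumps(clf))
    print(clf_restored.predict_proba(X)[:3])

    # DataFrame of importances, indexed by the feature names passed to the constructor
    print(clf.get_feature_importances())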

class XGBoostClassifier(XGBoostBase, Classifier):
    __doc__ = 'Implements classification model from XGBoost library. \n' \
              + remove_first_line(XGBoostBase.__doc__)

    def __init__(self, features=None, n_estimators=100, nthreads=16, num_feature=None, gamma=None, eta=0.3,
                 max_depth=6, scale_pos_weight=1., min_child_weight=1., subsample=1., colsample=1.,
                 base_score=0.5, verbose=0, missing=-999., random_state=0):
        XGBoostBase.__init__(self, n_estimators=n_estimators, nthreads=nthreads, num_feature=num_feature,
                             gamma=gamma, eta=eta, max_depth=max_depth, scale_pos_weight=scale_pos_weight,
                             min_child_weight=min_child_weight, subsample=subsample, colsample=colsample,
                             base_score=base_score, verbose=verbose, missing=missing, random_state=random_state)
        Classifier.__init__(self, features=features)

    def fit(self, X, y, sample_weight=None):
        X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=False)
        sample_weight = normalize_weights(y, sample_weight=sample_weight, per_class=False)
        X = self._get_features(X)
        self._set_classes(y)
        if self.n_classes_ >= 2:
            return self._fit(X, y, 'multi:softprob', sample_weight=sample_weight, num_class=self.n_classes_)
    fit.__doc__ = Classifier.fit.__doc__

    def predict_proba(self, X):
        self._check_fitted()
        X_dmat = self._make_dmatrix(self._get_features(X))
        prediction = self.xgboost_estimator.predict(X_dmat, ntree_limit=0)
        if self.n_classes_ >= 2:
            return prediction.reshape(X.shape[0], self.n_classes_)
    predict_proba.__doc__ = Classifier.predict_proba.__doc__

    def staged_predict_proba(self, X, step=None):
        """
        Predict probabilities for data for each class label on each stage.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param int step: step for returned iterations (None by default).
            XGBoost does not implement this functionality and we need to predict from the beginning each time.
            With `None` passed, step is chosen to have 10 points in the learning curve.
        :return: iterator

        .. warning:: this method may be very slow, it takes iterations^2 / step time.
        """
        self._check_fitted()
        X_dmat = self._make_dmatrix(self._get_features(X))
        if step is None:
            step = max(self.n_estimators // 10, 1)
        # TODO use applying tree-by-tree
        for i in range(1, self.n_estimators // step + 1):
            prediction = self.xgboost_estimator.predict(X_dmat, ntree_limit=i * step)
            yield prediction.reshape(X.shape[0], self.n_classes_)
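
# Usage sketch (illustrative, not part of the library): fit the classifier with a few of the
# parameters documented above and trace log-loss across boosting stages with staged_predict_proba.
# The dataset, column names and the choice of metric are assumptions for this example only.
def _example_classifier_learning_curve():
    from sklearn.datasets import make_classification
    from sklearn.metrics import log_loss

    X, y = make_classification(n_samples=500, n_features=6, random_state=7)
    X = pandas.DataFrame(X, columns=['f%d' % i for i in range(6)])

    clf = XGBoostClassifier(n_estimators=50, eta=0.1, max_depth=4, subsample=0.8, nthreads=2)
    clf.fit(X, y)

    # staged_predict_proba re-predicts from the first tree at every checkpoint
    # (roughly iterations^2 / step work), so a coarse step keeps it affordable
    for stage, proba in enumerate(clf.staged_predict_proba(X, step=10), start=1):
        print('after %d trees: log_loss = %.4f' % (stage * 10, log_loss(y, proba)))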

class XGBoostRegressor(XGBoostBase, Regressor):
    __doc__ = 'Implements regression model from XGBoost library. \n' + remove_first_line(XGBoostBase.__doc__)

    def __init__(self, features=None, n_estimators=100, nthreads=16, num_feature=None, gamma=None, eta=0.3,
                 max_depth=6, min_child_weight=1., subsample=1., colsample=1., objective_type='linear',
                 base_score=0.5, verbose=0, missing=-999., random_state=0):
        XGBoostBase.__init__(self, n_estimators=n_estimators, nthreads=nthreads, num_feature=num_feature,
                             gamma=gamma, eta=eta, max_depth=max_depth, min_child_weight=min_child_weight,
                             subsample=subsample, colsample=colsample, base_score=base_score,
                             verbose=verbose, missing=missing, random_state=random_state)
        Regressor.__init__(self, features=features)
        self.objective_type = objective_type

    def fit(self, X, y, sample_weight=None):
        X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=False)
        sample_weight = normalize_weights(y, sample_weight=sample_weight, per_class=False)
        X = self._get_features(X)
        assert self.objective_type in {'linear', 'logistic'}, 'Objective parameter is not valid'
        return self._fit(X, y, "reg:{}".format(self.objective_type), sample_weight=sample_weight)
    fit.__doc__ = Regressor.fit.__doc__

    def predict(self, X):
        self._check_fitted()
        X_dmat = self._make_dmatrix(self._get_features(X))
        return self.xgboost_estimator.predict(X_dmat, ntree_limit=0)
    predict.__doc__ = Regressor.predict.__doc__

    def staged_predict(self, X, step=None):
        """
        Predicts values for data on each stage.

        :param X: pandas.DataFrame of shape [n_samples, n_features]
        :param int step: step for returned iterations (None by default).
            XGBoost does not implement this functionality and we need to predict from the beginning each time.
            With `None` passed, step is chosen to have 10 points in the learning curve.
        :return: iterator

        .. warning:: this method may be very slow, it takes iterations^2 / step time.
        """
        self._check_fitted()
        X_dmat = self._make_dmatrix(self._get_features(X))
        if step is None:
            step = max(self.n_estimators // 10, 1)
        # TODO use applying tree-by-tree
        for i in range(1, self.n_estimators // step + 1):
            yield self.xgboost_estimator.predict(X_dmat, ntree_limit=i * step)
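
# Usage sketch (illustrative, not part of the library): the regressor follows the same pattern,
# with objective_type selecting 'reg:linear' or 'reg:logistic' inside fit. The synthetic data
# and the metric below are assumptions made for this example only.
def _example_regressor():
    from sklearn.datasets import make_regression
    from sklearn.metrics import mean_squared_error

    X, y = make_regression(n_samples=300, n_features=5, noise=0.1, random_state=1)
    X = pandas.DataFrame(X, columns=['x%d' % i for i in range(5)])

    reg = XGBoostRegressor(n_estimators=40, eta=0.1, max_depth=3, objective_type='linear', nthreads=2)
    reg.fit(X, y)

    print('final MSE: %.3f' % mean_squared_error(y, reg.predict(X)))
    # staged_predict walks the same checkpoints as the classifier variant
    for stage, prediction in enumerate(reg.staged_predict(X, step=20), start=1):
        print('after %d trees: MSE = %.3f' % (stage * 20, mean_squared_error(y, prediction)))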