# Source code for rep.estimators.sklearn

"""
:class:`SklearnClassifier` and :class:`SklearnRegressor` are wrappers for algorithms from scikit-learn.

From the user's perspective, a wrapped sklearn model behaves in the same way as a non-wrapped one,
but has one additional parameter *features* for choosing which columns are used in training.

Typically, models from **REP** are used with `pandas.DataFrame` objects,
which makes it possible to refer to the needed variables by name or to give some variables a specific role in the training.

If data has :class:`numpy.array` type then behaviour will be the same as in sklearn.
For complete list of the available algorithms, see `sklearn API <http://scikit-learn.org/stable/modules/classes.html>`_.
"""
from __future__ import division, print_function, absolute_import
from abc import ABCMeta

from .interface import Classifier, Regressor
from .utils import check_inputs

__author__ = 'Alex Rogozhnikov'

__all__ = ['SklearnClassifier', 'SklearnRegressor']


class SklearnBase(object):
    """
    SklearnBase is a base for wrappers over sklearn-like models.

    The wrapped estimator is stored in ``self.clf``. Any attribute that is not
    found on the wrapper itself is looked up on the wrapped estimator
    (see ``__getattr__``).
    """

    __metaclass__ = ABCMeta

    def __init__(self, clf):
        """
        :param clf: estimator to be used in training.
        """
        self.clf = clf

    def _fit(self, X, y, sample_weight=None, **kwargs):
        """
        Train the {estimator}.

        :param pandas.DataFrame X: data shape [n_samples, n_features]
        :param y: target of training, array-like of shape [n_samples]
        :param sample_weight: weight of samples,
               array-like of shape [n_samples] or None if all weights are equal
        :param kwargs: additional keyword arguments, passed unchanged to `fit` of the wrapped estimator

        :return: self

        .. note:: if sklearn {estimator} doesn't support `sample_weight`, then put `sample_weight=None`,
            otherwise exception will be thrown.

        """
        # NOTE: this docstring is a template; subclasses fill in the estimator
        # placeholder with str.format, so it must not contain other braces.
        # `_get_features` is not defined in this class; presumably it comes from
        # the interface class mixed in next to SklearnBase -- verify there.
        features = self._get_features(X)
        if sample_weight is None:
            # Do not pass the keyword at all, so that estimators whose `fit`
            # has no `sample_weight` argument still work.
            self.clf.fit(features, y, **kwargs)
        else:
            self.clf.fit(features, y, sample_weight=sample_weight, **kwargs)
        return self

    def __getattr__(self, name):
        """
        Delegate lookup of attributes missing on the wrapper to the wrapped estimator.

        Python calls this hook only after normal attribute lookup has failed.

        :param str name: name of the requested attribute
        :return: value of the attribute taken from ``self.clf``
        :raises AttributeError: for pickling hooks, for ``clf`` itself when it is not set yet,
            and when the wrapped estimator has no such attribute
        """
        # '__getstate__' / '__setstate__': in order not to break pickling/unpickling.
        # 'clf': if the instance has no `clf` yet (created via __new__, or while
        # being unpickled/copied), reading `self.clf` below would re-enter this
        # method forever; fail with a regular AttributeError instead.
        if name in ('__getstate__', '__setstate__', 'clf'):
            raise AttributeError(name)
        # Those methods not defined here will be used directly from the estimator.
        # getattr() is used so that a `__getattr__` defined by the estimator is honoured too.
        return getattr(self.clf, name)

    def set_params(self, **params):
        """
        Set the parameters of this estimator.
        Parameters of the base estimator can be accessed (for example param `depth`) by both *depth* and *clf__depth*.

        :param dict params: parameters to set in the model
        :return: self
        """
        params_for_clf = {}
        for name, value in params.items():
            if name == 'features':
                self.features = value
            elif name == 'clf':
                self.clf = value
            elif name.startswith('clf__'):
                # strip the 'clf__' prefix
                params_for_clf[name[len('clf__'):]] = value
            else:
                params_for_clf[name] = value
        # Applied after the loop, so a `clf` replaced above receives the parameters.
        self.clf.set_params(**params_for_clf)
        # Follow the scikit-learn convention so that calls can be chained.
        return self


class SklearnClassifier(SklearnBase, Classifier):
    # Wraps a sklearn-like classifier; every prediction method selects the
    # columns with `_get_features` and then calls the same-named method of `self.clf`.

    def __init__(self, clf, features=None):
        """
        SklearnClassifier is a wrapper over sklearn-like **classifiers**.

        :param sklearn.BaseEstimator clf: classifier to train. Should be sklearn-compatible.
        :param features: features used in training
        :type features: list[str] or None
        :raises ValueError: if `clf` is itself an instance of :class:`Classifier`
        """
        if isinstance(clf, Classifier):
            raise ValueError('Base class should be simply sklearn classifier, not descendant of Classifier')
        SklearnBase.__init__(self, clf=clf)
        Classifier.__init__(self, features=features)

    def fit(self, X, y, sample_weight=None, **kwargs):
        # `check_inputs` is imported from .utils; presumably it validates/normalizes
        # the inputs -- see its definition for details.
        X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=True)
        # `_set_classes` is not defined in this file; presumably inherited from Classifier.
        self._set_classes(y)
        return self._fit(X, y, sample_weight=sample_weight, **kwargs)

    # Docstrings are stripped (become None) under ``python -OO``; formatting None
    # would raise at class-creation time, so only fill the template when it exists.
    if SklearnBase._fit.__doc__ is not None:
        fit.__doc__ = SklearnBase._fit.__doc__.format(estimator='classifier')

    def predict(self, X):
        return self.clf.predict(self._get_features(X))

    predict.__doc__ = Classifier.predict.__doc__

    def predict_proba(self, X):
        return self.clf.predict_proba(self._get_features(X))

    predict_proba.__doc__ = Classifier.predict_proba.__doc__

    def staged_predict_proba(self, X):
        return self.clf.staged_predict_proba(self._get_features(X))

    staged_predict_proba.__doc__ = Classifier.staged_predict_proba.__doc__


class SklearnRegressor(SklearnBase, Regressor):
    # Wraps a sklearn-like regressor; every prediction method selects the
    # columns with `_get_features` and then calls the same-named method of `self.clf`.

    def __init__(self, clf, features=None):
        """
        SklearnRegressor is a wrapper over sklearn-like **regressors**

        :param sklearn.BaseEstimator clf: regressor to train. Should be sklearn-compatible.
        :param features: features used in training
        :type features: list[str] or None
        :raises ValueError: if `clf` is itself an instance of :class:`Regressor`
        """
        if isinstance(clf, Regressor):
            raise ValueError('Base class should be simply sklearn regressor, not descendant of Regressor')
        SklearnBase.__init__(self, clf=clf)
        Regressor.__init__(self, features=features)

    def fit(self, X, y, sample_weight=None, **kwargs):
        # `check_inputs` is imported from .utils; presumably it validates/normalizes
        # the inputs -- see its definition for details.
        X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=True)
        return self._fit(X, y, sample_weight=sample_weight, **kwargs)

    # Docstrings are stripped (become None) under ``python -OO``; formatting None
    # would raise at class-creation time, so only fill the template when it exists.
    if SklearnBase._fit.__doc__ is not None:
        fit.__doc__ = SklearnBase._fit.__doc__.format(estimator='regressor')

    def predict(self, X):
        return self.clf.predict(self._get_features(X))

    predict.__doc__ = Regressor.predict.__doc__

    def staged_predict(self, X):
        """
        Predicts regression target at each stage for X.
        This method allows monitoring of error after each stage.

        :return: iterator
        """
        return self.clf.staged_predict(self._get_features(X))

    # The docstring above is replaced by the one declared on the Regressor interface.
    staged_predict.__doc__ = Regressor.staged_predict.__doc__