Source code for rep.estimators.interface

"""
**REP** wrappers are derived from :class:`Classifier` and :class:`Regressor`
depending on the problem of interest.

Below you can see the standard methods available in the wrappers.

"""
from __future__ import division, print_function, absolute_import
from abc import ABCMeta, abstractmethod

import numpy
import pandas
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin

from .utils import _get_features

__author__ = 'Tatiana Likhomanenko, Alex Rogozhnikov'

# Shared class-docstring template: the single ``{}`` placeholder is filled in
# with the kind of problem via ``_docs.format(...)`` when the wrappers below
# define their ``__doc__``.
_docs = (
    """
    Interface to train different **{}** models from different machine learning libraries, like **sklearn, TMVA, XGBoost**, ...

    :param features: features used to train a model
    :type features: list[str] or None

    .. note::
        * if `features` aren't set (**None**), then all features in the training dataset will be used

        * Datasets should be `pandas.DataFrame`, not `numpy.array`.
          Provided this, you'll be able to choose features used in training by setting e.g.
          `features=['mass', 'momentum']` in the constructor.

        * It works fine with `numpy.array` as well, but in this case all the features will be used.
    """
)


class Classifier(BaseEstimator, ClassifierMixin):
    # The class docstring is the shared `_docs` template plus one extra note
    # bullet; the bullet is indented to line up with the template's note list.
    __doc__ = _docs.format('classification') + \
        """
        * Classes values must be from 0 to n_classes-1!
    """

    # NOTE(review): `__metaclass__` is only honoured by Python 2. On Python 3
    # this attribute is ignored, so the @abstractmethod markers below are not
    # enforced at instantiation time -- confirm whether that is intended.
    __metaclass__ = ABCMeta

    def __init__(self, features=None):
        # Copy into a list so that any iterable of names is accepted;
        # None is kept as-is.
        self.features = list(features) if features is not None else features

    def _get_features(self, X, allow_nans=False):
        """
        Return data with the necessary features.

        Delegates to the module-level `_get_features` helper and stores the
        feature list it returns back into `self.features`.

        :param pandas.DataFrame X: training data
        :param allow_nans: passed through to the helper unchanged
        :return: pandas.DataFrame with necessary features
        """
        X_prepared, self.features = _get_features(self.features, X, allow_nans=allow_nans)
        return X_prepared

    def _set_classes(self, y):
        """
        Remember the distinct labels found in `y` and validate them.

        Sets `self.classes_` (sorted unique labels) and `self.n_classes_`.

        :param y: labels of samples, array-like
        :return: indices of the first occurrence of each class in `y`
        :raises AssertionError: if there are fewer than two distinct labels or
            the labels are not exactly 0..n_classes-1
        """
        self.classes_, indices = numpy.unique(y, return_index=True)
        self.n_classes_ = len(self.classes_)
        # Raised explicitly (instead of using `assert`) so that the validation
        # is not stripped when Python runs with -O; the exception type is kept
        # as AssertionError for backward compatibility.
        if self.n_classes_ < 2:
            raise AssertionError(
                "Number of labels must be >= 2 (data contain {})".format(self.n_classes_))
        if not numpy.all(self.classes_ == numpy.arange(self.n_classes_)):
            raise AssertionError(
                'Labels must be from 0..n_classes-1, instead of {}'.format(self.classes_))
        return indices

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """
        Train a classification model on the data.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: labels of samples, array-like of shape [n_samples]
        :param sample_weight: weight of samples,
               array-like of shape [n_samples] or None if all weights are equal

        :return: self
        """
        pass

    def predict(self, X):
        """
        Predict labels for all samples in the dataset.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: numpy.array of shape [n_samples] with integer labels
        """
        proba = self.predict_proba(X)
        # Pick the most probable column for each sample and map it to its label.
        return self.classes_.take(numpy.argmax(proba, axis=1), axis=0)

    @abstractmethod
    def predict_proba(self, X):
        """
        Predict probabilities for each class label for samples.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: numpy.array of shape [n_samples, n_classes] with probabilities
        """
        pass

    @abstractmethod
    def staged_predict_proba(self, X):
        """
        Predict probabilities for data for each class label on each stage (i.e. for boosting algorithms).

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: iterator
        """
        pass

    def get_feature_importances(self):
        """
        Return features importance.

        :rtype: pandas.DataFrame with `index=self.features`
        :raises AttributeError: if the estimator has no `feature_importances_`
        """
        # Only the attribute lookup is guarded, so an unrelated AttributeError
        # raised while building the DataFrame is not misreported.
        try:
            importances = self.feature_importances_
        except AttributeError:
            raise AttributeError("Haven't feature_importances_ property")
        return pandas.DataFrame({"effect": importances}, index=self.features)

    def fit_lds(self, lds):
        """
        Train a classifier on the specific type of dataset.

        :param LabeledDataStorage lds: data

        :return: self
        """
        X, y, sample_weight = lds.get_data(self.features), lds.get_targets(), lds.get_weights(allow_nones=True)
        return self.fit(X, y, sample_weight=sample_weight)

    def test_on_lds(self, lds):
        """
        Prepare a classification report for a single classifier.

        :param LabeledDataStorage lds: data
        :return: ClassificationReport
        """
        # Imported here rather than at module level (presumably to avoid a
        # circular import between the estimators and report packages).
        from ..report import ClassificationReport
        return ClassificationReport(classifiers={'clf': self}, lds=lds)

    def test_on(self, X, y, sample_weight=None):
        """
        Prepare classification report for a single classifier.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: labels of samples --- array-like of shape [n_samples]
        :param sample_weight: weight of samples,
               array-like of shape [n_samples] or None if all weights are equal
        :return: ClassificationReport
        """
        from ..data import LabeledDataStorage
        lds = LabeledDataStorage(data=X, target=y, sample_weight=sample_weight)
        return self.test_on_lds(lds=lds)
class Regressor(BaseEstimator, RegressorMixin):
    __doc__ = _docs.format('regression')

    # NOTE(review): `__metaclass__` is only honoured by Python 2. On Python 3
    # this attribute is ignored, so the @abstractmethod markers below are not
    # enforced at instantiation time -- confirm whether that is intended.
    __metaclass__ = ABCMeta

    def __init__(self, features=None):
        # Copy into a list so that any iterable of names is accepted;
        # None is kept as-is.
        self.features = list(features) if features is not None else features

    def _get_features(self, X, allow_nans=False):
        """
        Return data with the necessary features.

        Delegates to the module-level `_get_features` helper and stores the
        feature list it returns back into `self.features`.

        :param pandas.DataFrame X: training data
        :param allow_nans: passed through to the helper unchanged
        :return: pandas.DataFrame with necessary features
        """
        X_prepared, self.features = _get_features(self.features, X, allow_nans=allow_nans)
        return X_prepared

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """
        Train a regression model on the data.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: values for samples, array-like of shape [n_samples]
        :param sample_weight: weight of samples,
               array-like of shape [n_samples] or None if all weights are equal

        :return: self
        """
        pass

    @abstractmethod
    def predict(self, X):
        """
        Predict values for data.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: numpy.array of shape [n_samples] with predicted values
        """
        pass

    @abstractmethod
    def staged_predict(self, X):
        """
        Predicts values for data on each stage (i.e. for boosting algorithms).

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: iterator
        """
        pass

    def fit_lds(self, lds):
        """
        Train a regression model on the specific type of dataset.

        :param LabeledDataStorage lds: data

        :return: self
        """
        X, y, sample_weight = lds.get_data(self.features), lds.get_targets(), lds.get_weights()
        # `sample_weight` is only forwarded when the storage actually provides
        # weights; otherwise `fit` is called with its own default.
        if sample_weight is None:
            return self.fit(X, y)
        else:
            return self.fit(X, y, sample_weight=sample_weight)

    def get_feature_importances(self):
        """
        Get features importances.

        :rtype: pandas.DataFrame with `index=self.features`
        :raises AttributeError: if the estimator has no `feature_importances_`
        """
        # Only the attribute lookup is guarded, so an unrelated AttributeError
        # raised while building the DataFrame is not misreported.
        try:
            importances = self.feature_importances_
        except AttributeError:
            # The message names the regressor (it previously said "Classifier").
            raise AttributeError("Regressor doesn't provide feature_importances_ property")
        return pandas.DataFrame({"effect": importances}, index=self.features)

    def test_on_lds(self, lds):
        """
        Prepare a regression report for a single regressor.

        :param LabeledDataStorage lds: data
        :return: RegressionReport
        """
        # Imported here rather than at module level (presumably to avoid a
        # circular import between the estimators and report packages).
        from ..report import RegressionReport
        return RegressionReport(regressors={'clf': self}, lds=lds)

    def test_on(self, X, y, sample_weight=None):
        """
        Prepare a regression report for a single regressor

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: values of samples --- array-like of shape [n_samples]
        :param sample_weight: weight of samples,
               array-like of shape [n_samples] or None if all weights are equal
        :return: RegressionReport
        """
        from ..data import LabeledDataStorage
        lds = LabeledDataStorage(data=X, target=y, sample_weight=sample_weight)
        return self.test_on_lds(lds=lds)