"""
**REP** wrappers are derived from :class:`Classifier` and :class:`Regressor`
depending on the problem of interest.
Below you can see the standard methods available in the wrappers.
"""
from __future__ import division, print_function, absolute_import
from abc import ABCMeta, abstractmethod
import numpy
import pandas
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from .utils import _get_features
__author__ = 'Tatiana Likhomanenko, Alex Rogozhnikov'
# Shared docstring template for the wrapper base classes below.
# Each wrapper fills in the problem type (e.g. 'classification' or
# 'regression') via str.format when building its __doc__.
_docs = \
"""
Interface to train different **{}** models from different machine learning libraries, like **sklearn, TMVA, XGBoost**, ...
:param features: features used to train a model
:type features: list[str] or None
.. note::
* if `features` aren't set (**None**), then all features in the training dataset will be used
* Datasets should be `pandas.DataFrame`, not `numpy.array`.
Provided this, you'll be able to choose features used in training by setting e.g.
`features=['mass', 'momentum']` in the constructor.
* It works fine with `numpy.array` as well, but in this case all the features will be used.
"""
class Classifier(BaseEstimator, ClassifierMixin):
    __doc__ = _docs.format('classification') + \
"""
* Classes values must be from 0 to n_classes-1!
"""

    # Python 2-style ABC declaration, kept for compatibility with the
    # original codebase; abstract methods below are marked with @abstractmethod.
    __metaclass__ = ABCMeta

    def __init__(self, features=None):
        # Copy into a plain list so later mutation of the caller's sequence
        # cannot change the model; None means "use all features in the data".
        self.features = list(features) if features is not None else features

    def _get_features(self, X, allow_nans=False):
        """
        Return data with the necessary features.

        Also updates `self.features` with the features actually resolved
        from the dataset (relevant when it was None).

        :param pandas.DataFrame X: training data
        :param bool allow_nans: if True, NaN values are accepted in the data
        :return: pandas.DataFrame with necessary features
        """
        X_prepared, self.features = _get_features(self.features, X, allow_nans=allow_nans)
        return X_prepared

    def _set_classes(self, y):
        """
        Remember the class labels (`classes_`, `n_classes_`) and check that
        they form the contiguous range 0..n_classes-1.

        :param y: labels of samples, array-like of shape [n_samples]
        :return: indices of the first occurrence of each class in `y`
        """
        self.classes_, indices = numpy.unique(y, return_index=True)
        self.n_classes_ = len(self.classes_)
        assert self.n_classes_ >= 2, "Number of labels must be >= 2 (data contain {})".format(self.n_classes_)
        assert numpy.all(self.classes_ == numpy.arange(self.n_classes_)), \
            'Labels must be from 0..n_classes-1, instead of {}'.format(self.classes_)
        return indices

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """
        Train a classification model on the data.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: labels of samples, array-like of shape [n_samples]
        :param sample_weight: weight of samples,
            array-like of shape [n_samples] or None if all weights are equal
        :return: self
        """
        pass

    def predict(self, X):
        """
        Predict labels for all samples in the dataset.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: numpy.array of shape [n_samples] with integer labels
        """
        proba = self.predict_proba(X)
        # The column with maximal probability determines the predicted label.
        return self.classes_.take(numpy.argmax(proba, axis=1), axis=0)

    @abstractmethod
    def predict_proba(self, X):
        """
        Predict probabilities for each class label for samples.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: numpy.array of shape [n_samples, n_classes] with probabilities
        """
        pass

    @abstractmethod
    def staged_predict_proba(self, X):
        """
        Predict probabilities for data for each class label on each stage
        (i.e. for boosting algorithms).

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: iterator
        """
        pass

    def get_feature_importances(self):
        """
        Return features importance.

        :rtype: pandas.DataFrame with `index=self.features`
        :raises AttributeError: if the underlying model has no
            `feature_importances_` attribute
        """
        try:
            return pandas.DataFrame({"effect": self.feature_importances_}, index=self.features)
        except AttributeError:
            # Re-raise with a clearer message, consistent with Regressor.
            raise AttributeError("Classifier doesn't provide feature_importances_ property")

    def fit_lds(self, lds):
        """
        Train a classifier on the specific type of dataset.

        :param LabeledDataStorage lds: data
        :return: self
        """
        X, y, sample_weight = lds.get_data(self.features), lds.get_targets(), lds.get_weights(allow_nones=True)
        return self.fit(X, y, sample_weight=sample_weight)

    def test_on_lds(self, lds):
        """
        Prepare a classification report for a single classifier.

        :param LabeledDataStorage lds: data
        :return: ClassificationReport
        """
        # Imported lazily to avoid a circular import with the report module.
        from ..report import ClassificationReport
        return ClassificationReport(classifiers={'clf': self}, lds=lds)

    def test_on(self, X, y, sample_weight=None):
        """
        Prepare a classification report for a single classifier.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: labels of samples --- array-like of shape [n_samples]
        :param sample_weight: weight of samples,
            array-like of shape [n_samples] or None if all weights are equal
        :return: ClassificationReport
        """
        # Imported lazily to avoid a circular import with the data module.
        from ..data import LabeledDataStorage
        lds = LabeledDataStorage(data=X, target=y, sample_weight=sample_weight)
        return self.test_on_lds(lds=lds)
class Regressor(BaseEstimator, RegressorMixin):
    __doc__ = _docs.format('regression')

    # Python 2-style ABC declaration, kept for compatibility with the
    # original codebase; abstract methods below are marked with @abstractmethod.
    __metaclass__ = ABCMeta

    def __init__(self, features=None):
        # Copy into a plain list so later mutation of the caller's sequence
        # cannot change the model; None means "use all features in the data".
        self.features = list(features) if features is not None else features

    def _get_features(self, X, allow_nans=False):
        """
        Return data with the necessary features.

        Also updates `self.features` with the features actually resolved
        from the dataset (relevant when it was None).

        :param pandas.DataFrame X: training data
        :param bool allow_nans: if True, NaN values are accepted in the data
        :return: pandas.DataFrame with necessary features
        """
        X_prepared, self.features = _get_features(self.features, X, allow_nans=allow_nans)
        return X_prepared

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """
        Train a regression model on the data.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: values for samples, array-like of shape [n_samples]
        :param sample_weight: weight of samples,
            array-like of shape [n_samples] or None if all weights are equal
        :return: self
        """
        pass

    @abstractmethod
    def predict(self, X):
        """
        Predict values for data.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: numpy.array of shape [n_samples] with predicted values
        """
        pass

    @abstractmethod
    def staged_predict(self, X):
        """
        Predict values for data on each stage (i.e. for boosting algorithms).

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :rtype: iterator
        """
        pass

    def fit_lds(self, lds):
        """
        Train a regression model on the specific type of dataset.

        :param LabeledDataStorage lds: data
        :return: self
        """
        X, y, sample_weight = lds.get_data(self.features), lds.get_targets(), lds.get_weights()
        # Only forward sample_weight when the storage actually provides one,
        # so implementations that don't accept it still work.
        if sample_weight is None:
            return self.fit(X, y)
        else:
            return self.fit(X, y, sample_weight=sample_weight)

    def get_feature_importances(self):
        """
        Get features importances.

        :rtype: pandas.DataFrame with `index=self.features`
        :raises AttributeError: if the underlying model has no
            `feature_importances_` attribute
        """
        try:
            return pandas.DataFrame({"effect": self.feature_importances_}, index=self.features)
        except AttributeError:
            # Re-raise with a clearer message; say "Regressor" (the original
            # wrongly said "Classifier" here).
            raise AttributeError("Regressor doesn't provide feature_importances_ property")

    def test_on_lds(self, lds):
        """
        Prepare a regression report for a single regressor.

        :param LabeledDataStorage lds: data
        :return: RegressionReport
        """
        # Imported lazily to avoid a circular import with the report module.
        from ..report import RegressionReport
        return RegressionReport(regressors={'clf': self}, lds=lds)

    def test_on(self, X, y, sample_weight=None):
        """
        Prepare a regression report for a single regressor.

        :param pandas.DataFrame X: data of shape [n_samples, n_features]
        :param y: values of samples --- array-like of shape [n_samples]
        :param sample_weight: weight of samples,
            array-like of shape [n_samples] or None if all weights are equal
        :return: RegressionReport
        """
        # Imported lazily to avoid a circular import with the data module.
        from ..data import LabeledDataStorage
        lds = LabeledDataStorage(data=X, target=y, sample_weight=sample_weight)
        return self.test_on_lds(lds=lds)