Source code for rep.metaml.stacking

"""
:class:`FeatureSplitter` is defined in this module.

This meta-algorithm is handy to train different models for subsets of the data
without manually splitting the data into parts.

"""
from __future__ import division, print_function, absolute_import

import numpy
from sklearn.base import clone

from ..estimators import Classifier
from ..estimators.utils import check_inputs, _get_features


__author__ = 'Alex Rogozhnikov'


class FeatureSplitter(Classifier):
    """
    Dataset is split by values of `split_feature`; for each value of the feature
    a new classifier (a clone of `base_estimator`) is trained.

    When building predictions, each classifier predicts the events with
    the same value of `split_feature` it was trained on. Events whose
    `split_feature` value was not seen during :meth:`fit` are left with
    all-zero probabilities (no classifier exists for them).

    :param str split_feature: the name of key feature
    :param base_estimator: the classifier, its copies are trained on parts of dataset
    :param list[str] train_features: list of columns classifier uses in training
        (`split_feature` itself is not passed to the base classifiers)
    """

    def __init__(self, split_feature, base_estimator, train_features=None):
        self.base_estimator = base_estimator
        self.split_feature = split_feature
        self.train_features = train_features
        Classifier.__init__(self, features=self._features())

    def _features(self):
        """
        Build the full list of columns this estimator reads.

        :return: None if `train_features` is None, otherwise
            `train_features` followed by `split_feature`
        """
        if self.train_features is None:
            return None
        return list(self.train_features) + [self.split_feature]

    def _get_features(self, X, allow_nans=False):
        """
        Separate the split column from the training columns.

        Side effect: `self.train_features` and `self.features` are overwritten
        with the feature list resolved by the module-level helper.

        :param pandas.DataFrame X: dataset
        :param bool allow_nans: forwarded to the module-level `_get_features` helper
        :return: tuple (split_column_values, X_prepared), where split_column_values
            is a flat numpy.array with values of `split_feature` and X_prepared
            is the data restricted to `train_features`
        """
        # NOTE: the bare name below is the module-level helper imported from
        # ..estimators.utils, not this method (methods are not in bare-name scope).
        split_column_values, _ = _get_features([self.split_feature], X, allow_nans=allow_nans)
        split_column_values = numpy.ravel(numpy.array(split_column_values))
        X_prepared, self.train_features = _get_features(self.train_features, X, allow_nans=allow_nans)
        self.features = self._features()
        return split_column_values, X_prepared

    def fit(self, X, y, sample_weight=None):
        """
        Fit dataset: one clone of `base_estimator` is trained per distinct
        value of `split_feature`, using only the rows with that value.

        :param X: pandas.DataFrame of shape [n_samples, n_features] with features
        :param y: array-like of shape [n_samples] with targets
        :param sample_weight: array-like of shape [n_samples] with events weights or None.
        :return: self
        """
        if hasattr(self.base_estimator, 'features'):
            # Column selection is done by this meta-estimator via `train_features`;
            # a base estimator with its own feature list would conflict with it.
            assert self.base_estimator.features is None, \
                'Base estimator must have None features! ' \
                'Use train_features parameter in FeatureSplitter to fix it'
        X, y, sample_weight = check_inputs(X, y, sample_weight=sample_weight, allow_none_weights=True)
        # TODO cover the case of missing labels in subsets.
        split_column_values, X = self._get_features(X)
        self._set_classes(y)
        self.base_estimators = {}
        for value in numpy.unique(split_column_values):
            rows = split_column_values == value
            base_classifier = clone(self.base_estimator)
            if sample_weight is None:
                # Not every estimator accepts a sample_weight keyword, so it is
                # passed only when weights were actually provided.
                base_classifier.fit(X.iloc[rows, :], y[rows])
            else:
                base_classifier.fit(X.iloc[rows, :], y[rows], sample_weight=sample_weight[rows])
            self.base_estimators[value] = base_classifier
        return self

    def predict_proba(self, X):
        """
        Predict probabilities. Each event is predicted by the classifier trained on
        corresponding value of `split_feature`.

        Events with a `split_feature` value absent from the training data keep
        zero probabilities for every class.

        :param X: pandas.DataFrame of shape [n_samples, n_features]
        :return: probabilities of shape [n_samples, n_classes]
        """
        split_column_values, X = self._get_features(X)
        result = numpy.zeros([len(X), self.n_classes_])
        for value, estimator in self.base_estimators.items():
            mask = split_column_values == value
            if not numpy.any(mask):
                # No events for this value: skip, since many estimators
                # raise when asked to predict on an empty dataset.
                continue
            result[mask, :] = estimator.predict_proba(X.loc[mask, :])
        return result

    def staged_predict_proba(self, X):
        """
        Predict probabilities after each stage of base classifier.
        Each event is predicted by the classifier trained on corresponding value of `split_feature`.

        Iteration stops as soon as any of the involved base classifiers runs out
        of stages. Nothing is yielded if no event in X matches a trained value.

        :param X: pandas.DataFrame of shape [n_samples, n_features]
        :return: iterable sequence of numpy.arrays of shape [n_samples, n_classes]
        """
        split_column_values, X = self._get_features(X)
        result = numpy.zeros([len(X), self.n_classes_])
        masks_iterators = []
        for value, estimator in self.base_estimators.items():
            mask = split_column_values == value
            if not numpy.any(mask):
                # Same reasoning as in predict_proba: do not feed empty data.
                continue
            masks_iterators.append((mask, estimator.staged_predict_proba(X.loc[mask, :])))

        if not masks_iterators:
            # Without any iterator the loop below would never hit StopIteration
            # and would yield forever.
            return

        try:
            while True:
                for mask, prediction_iterator in masks_iterators:
                    result[mask, :] = next(prediction_iterator)
                # Yield a copy: `result` is updated in place on the next stage,
                # so handing out the same array would make all collected stages
                # alias the final state.
                yield result.copy()
        except StopIteration:
            # Caught inside the generator body, so this is a normal end of
            # iteration (and does not trigger PEP 479 RuntimeError).
            pass