# Source code for rep.report._base

from __future__ import division, print_function, absolute_import
from abc import ABCMeta, abstractmethod
import numpy
import pandas
import copy
from collections import OrderedDict
from .. import plotting
from .. import utils

__author__ = 'Alex Rogozhnikov, Tatiana Likhomanenko'


class AbstractReport(ABCMeta('_AbstractReportBase', (object,), {})):
    """
    Provides methods used both in Classification and Regression reports.

    Parameters:
    -----------
    :type lds: rep.data.storage.LabeledDataStorage
    :type estimators: dict[str, Classifier] or dict[str, Regressor]
    """
    # Inheriting from a base class created by calling ABCMeta (the six-style
    # `with_metaclass` trick) enforces @abstractmethod on both Python 2 and
    # Python 3. The previous `__metaclass__ = ABCMeta` class attribute is
    # Python-2-only and was silently ignored on Python 3, which made this
    # abstract class instantiable there.

    def __init__(self, estimators, lds):
        self.lds = lds
        # impose a deterministic iteration order over estimators
        if not isinstance(estimators, OrderedDict):
            estimators = OrderedDict(estimators)

        self.estimators = estimators

        # cache each estimator's predictions on the whole dataset once,
        # so the plotting/metric methods never re-run prediction
        self.prediction = OrderedDict()
        X = lds.get_data()
        for name, estimator in self.estimators.items():
            self.prediction[name] = self._predict(estimator, X)

        self.target, self.weight = lds.get_targets(), lds.get_weights()

        # features used by ALL estimators; serves as the default feature set
        self.common_features = list(
            set.intersection(*[set(estimator.features) for name, estimator in self.estimators.items()]))

    @abstractmethod
    def _predict(self, estimator, X):
        """Returns probabilities for classifiers and predictions for regressors"""
        pass

    def _apply_mask(self, mask, *args):
        """
        Filter each of *args* with *mask*.

        :param mask: None (keep all events) or anything ``lds.eval_column``
            accepts (array-like, expression string, or function of a DataFrame)
        :param args: arrays / DataFrames aligned with the dataset
        :return: tuple (boolean mask, masked_arg_1, masked_arg_2, ...)
        """
        if mask is None:
            return tuple([numpy.ones(len(self.lds), dtype=bool)] + list(args))
        mask = self.lds.eval_column(mask)
        # DataFrames are filtered positionally via iloc, plain arrays by boolean indexing
        mask_data = [data.iloc[mask, :] if isinstance(data, pandas.DataFrame) else data[mask] for data in args]
        return tuple([mask] + mask_data)

    def _get_features(self, features=None):
        """Fetch (a subset of) the dataset's feature columns as a DataFrame."""
        return self.lds.get_data(features=features)

    def features_correlation_matrix(self, features=None, mask=None, tick_labels=None, vmin=-1, vmax=1, cmap='Reds'):
        """
        Correlation between features

        :param features: using features (if None then use estimator's features)
        :type features: None or list[str]
        :param mask: mask for data, which will be used
        :type mask: None or numbers.Number or array-like or str or function(pandas.DataFrame)
        :param tick_labels: names for features in matrix
        :type tick_labels: None or array-like
        :param int vmin: min of value for min color
        :param int vmax: max of value for max color
        :param str cmap: color map name
        :rtype: plotting.ColorMap
        """
        features = self.common_features if features is None else features
        _, df = self._apply_mask(mask, self._get_features(features))
        features_names = list(df.columns)
        if tick_labels is None:
            tick_labels = features_names

        assert len(tick_labels) == len(features_names), 'Tick labels and features have different length'
        plot_corr = plotting.ColorMap(
            utils.calc_feature_correlation_matrix(df[features_names]),
            labels=tick_labels, vmin=vmin, vmax=vmax, cmap=cmap)
        plot_corr.title = 'Correlation'
        plot_corr.fontsize = 10
        # scale the figure roughly with the number of features shown
        plot_corr.figsize = (len(features) // 5 + 2, len(features) // 5)

        return plot_corr

    def learning_curve(self, metric, mask=None, steps=10, metric_label='metric', predict_only_masked=True):
        """
        Get learning curves

        :param function metric: function looks like function
            def function(y_true, y_pred, sample_weight=None)
        :param steps: if int, the same step is used in all learning curves,
            otherwise dict with steps for each estimator
        :type steps: int or dict
        :param str metric_label: name for metric on plot
        :param bool predict_only_masked: if True, will predict only for needed events.
          When you build learning curves for FoldingClassifier/FoldingRegressor on the same dataset,
          set this to False to get unbiased predictions.

        :rtype: plotting.FunctionsPlot
        """
        mask, data, labels, weight = self._apply_mask(mask, self._get_features(), self.target, self.weight)

        # common user error: passing the metric class instead of an instance
        if isinstance(metric, type):
            print(metric_label, ' is a type, not instance. Forgot to initialize?')

        # copy so that fitting the metric doesn't mutate the caller's object
        metric_func = copy.copy(metric)
        utils.fit_metric(metric_func, data, labels, sample_weight=weight)

        quality = OrderedDict()
        for estimator_name in self.prediction:
            if isinstance(steps, int):
                step = steps
            else:
                step = steps[estimator_name]
            try:
                quality[estimator_name] = self._learning_curve_additional(estimator_name, metric_func, step, mask,
                                                                          predict_only_masked=predict_only_masked)
            except (AttributeError, NotImplementedError):
                # estimator has no staged-prediction support; skip it on the plot
                print("Estimator {} doesn't support stage predictions".format(estimator_name))
        plot_fig = plotting.FunctionsPlot(quality)
        plot_fig.xlabel = 'stage'
        plot_fig.ylabel = '{}'.format(metric_label)
        plot_fig.title = 'Learning curves'
        return plot_fig

    def _learning_curve_additional(self, name, metric_func, step, mask, predict_only_masked):
        """ returns tuple (x_values, quality_values), which describe the learning curve """
        # NotImplementedError is deliberately caught (and reported) in learning_curve
        raise NotImplementedError('Should be implemented in descendants')

    def feature_importance(self, grid_columns=2):
        """
        Get features importance

        :param int grid_columns: count of columns in grid
        :rtype: plotting.GridPlot
        """
        importance_plots = []
        for name, estimator in self.estimators.items():
            try:
                df = estimator.get_feature_importances()
                df = {column: dict(df[column]) for column in df.columns}
                plot = plotting.BarComparePlot(df, sortby='effect')
                plot.title = 'Feature importance for %s' % name
                plot.fontsize = 10
                importance_plots.append(plot)
            except AttributeError:
                # estimator doesn't expose importances; report and continue
                print("Estimator {} doesn't support feature importances".format(name))
        return plotting.GridPlot(grid_columns, *importance_plots)

    def _feature_importance_shuffling(self, metric, mask=None, grid_columns=2):
        """
        Get features importance using shuffling method (apply random permutation to one particular column)

        :param metric: function to measure quality
            function(y_true, proba, sample_weight=None)
        :param mask: mask which points we should use
        :type mask: None or array-like or str or function(pandas.DataFrame)
        :param int grid_columns: number of columns in grid
        :rtype: plotting.GridPlot
        """
        importances_plots = []
        for name, estimator in self.estimators.items():
            result = dict()
            _, data, labels, weights = self._apply_mask(mask, self._get_features(estimator.features), self.target,
                                                        self.weight)
            metric_copy = copy.deepcopy(metric)
            utils.fit_metric(metric_copy, data, labels, sample_weight=weights)

            # measure quality after destroying (shuffling) one feature at a time;
            # the larger the degradation, the more important the feature
            for feature in data.columns:
                data_modified = data.copy()
                column = numpy.array(data_modified[feature])
                numpy.random.shuffle(column)
                data_modified[feature] = column
                predictions = self._predict(estimator, data_modified)
                result[feature] = metric_copy(labels, predictions, sample_weight=weights)

            plot_fig = plotting.BarComparePlot({name: result}, sortby=name)
            plot_fig.title = 'Feature importance for %s' % name
            plot_fig.fontsize = 10
            importances_plots.append(plot_fig)
        return plotting.GridPlot(grid_columns, *importances_plots)

    def compute_metric(self, metric, mask=None):
        """
        Compute metric value

        :param metric: function like object with::

            __call__(self, y_true, prob, sample_weight=None)

        :param mask: mask, points we should use
        :type mask: None or array-like or str or function(pandas.DataFrame)

        :return: metric value for each estimator
        """
        mask, data, labels, weight = self._apply_mask(mask, self._get_features(), self.target, self.weight)

        # common user error: passing the metric class instead of an instance
        if isinstance(metric, type):
            print('Metric is a type, not instance. Forgot to initialize?')
        # copy so that fitting the metric doesn't mutate the caller's object
        metric_func = copy.copy(metric)
        utils.fit_metric(metric_func, data, labels, sample_weight=weight)

        quality = OrderedDict()
        for estimator_name, prediction in self.prediction.items():
            # predictions were cached for the full dataset; apply the mask here
            quality[estimator_name] = metric_func(labels, prediction[mask], sample_weight=weight)
        return quality