# Source code for rep.report.regression

"""
This file contains report class for regression estimators. Report includes:

    * features scatter plots, correlations
    * learning curve
    * feature importance
    * feature importance by shuffling the feature column

Report methods return objects that can be drawn with their ``plot`` method (for details see :mod:`rep.plotting`).
"""

from __future__ import division, print_function, absolute_import

from itertools import islice
from collections import OrderedDict
import itertools

from sklearn.metrics import mean_squared_error

from .. import plotting
from ..utils import get_columns_dict
from ._base import AbstractReport
from ..estimators.interface import Regressor

__author__ = 'Alex Rogozhnikov, Tatiana Likhomanenko'


class RegressionReport(AbstractReport):
    """
    Report that compares several regressors on one labeled dataset.
    """

    def __init__(self, regressors, lds):
        """
        Report simplifies comparison of regressors on the same dataset.

        :param regressors: OrderedDict with regressors (RegressionFactory)
        :type regressors: dict[str, Regressor]
        :param LabeledDataStorage lds: data
        """
        for name, regressor in regressors.items():
            assert isinstance(regressor, Regressor), "Object {} doesn't implement interface".format(name)
        AbstractReport.__init__(self, lds=lds, estimators=regressors)

    def _predict(self, estimator, X):
        """Return the output of ``estimator.predict`` for the data ``X``."""
        return estimator.predict(X)

    def scatter(self, correlation_pairs, mask=None, marker_size=20, alpha=0.1, grid_columns=2):
        """
        Correlation between pairs of features

        :param list[tuple] correlation_pairs: pairs of features along which scatter plots will be built.
        :param mask: mask for data, which will be used
        :type mask: None or array-like or str or function(pandas.DataFrame)
        :param int marker_size: size of marker for each event on the plot
        :param float alpha: blending parameter for scatter
        :param int grid_columns: count of columns in grid

        :rtype: plotting.GridPlot
        """
        # Every feature is requested once; OrderedDict.fromkeys removes duplicates while keeping
        # first-seen order, so the requested column order does not depend on string hashing.
        features = list(OrderedDict.fromkeys(itertools.chain.from_iterable(correlation_pairs)))
        _, df = self._apply_mask(mask, self._get_features(features))
        correlation_plots = self._scatter_addition(df, correlation_pairs, marker_size=marker_size, alpha=alpha)
        return plotting.GridPlot(grid_columns, *correlation_plots)

    def predictions_scatter(self, features=None, mask=None, marker_size=20, alpha=0.1, grid_columns=2):
        """
        Correlation between predictions and features

        :param features: features to plot against (if None, ``self.common_features`` is used)
        :type features: None or list[str]
        :param mask: mask for data, which will be used
        :type mask: None or array-like or str or function(pandas.DataFrame)
        :param int marker_size: size of marker for each event on the plot
        :param float alpha: blending parameter for scatter
        :param int grid_columns: count of columns in grid

        :rtype: plotting.GridPlot
        """
        features = self.common_features if features is None else features
        mask, df = self._apply_mask(mask, self._get_features(features))
        correlation_plots = []
        for name, prediction in self.prediction.items():
            correlation_pairs = [(feature, name) for feature in features]
            # The masked predictions are stored as a column named after the estimator, so that
            # _scatter_addition can look up both axes of each pair by name in the same frame.
            # NOTE(review): an estimator whose name equals a feature name would overwrite that
            # feature column here -- confirm that names are always distinct.
            df[name] = prediction[mask]
            correlation_plots += self._scatter_addition(df, correlation_pairs, marker_size=marker_size, alpha=alpha)
        return plotting.GridPlot(grid_columns, *correlation_plots)

    @staticmethod
    def _scatter_addition(df, correlation_pairs, marker_size=20, alpha=0.1):
        """
        Build one scatter plot per pair of columns.

        :param df: data; columns are accessed as ``df[column].values``
        :param correlation_pairs: iterable of pairs (x column, y column)
        :param int marker_size: size of marker for each event on the plot
        :param float alpha: blending parameter for scatter
        :return: list of plotting.ScatterPlot, one per pair, in the order of ``correlation_pairs``
        """
        correlation_plots = []
        for feature1_c, feature2_c in correlation_pairs:
            # The keys of get_columns_dict serve both as column names in df and as axis labels
            # (presumably these are the names left after parsing feature expressions -- confirm
            # against get_columns_dict).
            feature1, feature2 = list(get_columns_dict([feature1_c, feature2_c]).keys())
            plot_fig = plotting.ScatterPlot({'correlation': (df[feature1].values, df[feature2].values)},
                                            alpha=alpha, size=marker_size)
            plot_fig.xlabel = feature1
            plot_fig.ylabel = feature2
            plot_fig.figsize = (8, 6)
            correlation_plots.append(plot_fig)
        return correlation_plots

    def _learning_curve_additional(self, name, metric_func, step, mask, predict_only_masked):
        """
        Compute learning curve values for one regressor, mask and metric function.

        :param str name: key of the regressor in ``self.estimators``
        :param metric_func: function(y_true, y_predicted, sample_weight=...) returning the metric value
        :param int step: every ``step``-th stage of ``staged_predict`` is evaluated
        :param mask: mask passed to ``self._apply_mask``
        :param bool predict_only_masked: if True, staged predictions are computed only for the masked data;
            otherwise they are computed for all data and the mask is applied to each prediction afterwards
        :return: tuple (list of stage indices, list of metric values)
        """
        evaled_mask, labels, weight = self._apply_mask(mask, self.target, self.weight)
        data = self._get_features()
        if predict_only_masked:
            _, data = self._apply_mask(mask, data)

        stages = []
        values = []
        stage_values = self.estimators[name].staged_predict(data)
        # islice keeps stages step - 1, 2 * step - 1, ... (0-based indices from enumerate).
        for stage, prediction in islice(enumerate(stage_values), step - 1, None, step):
            if not predict_only_masked:
                prediction = prediction[evaled_mask]
            stages.append(stage)
            values.append(metric_func(labels, prediction, sample_weight=weight))
        return stages, values

    def feature_importance_shuffling(self, metric=mean_squared_error, mask=None, grid_columns=2):
        """
        Get features importance using shuffling method (apply random permutation to one particular column)

        :param metric: function to measure quality
            function(y_true, y_predicted, sample_weight=None)
        :param mask: mask which points we should compare on
        :type mask: None or numbers.Number or array-like or str or function(pandas.DataFrame)
        :param int grid_columns: number of columns in grid
        :rtype: plotting.GridPlot
        """
        # The computation itself is delegated to self._feature_importance_shuffling; this method
        # only supplies the regression default for ``metric``.
        return self._feature_importance_shuffling(metric=metric, mask=mask, grid_columns=grid_columns)