# Source code for rep.metaml.cache

"""

In many cases training a classification/regression takes hours.
To avoid retraining at each step, one can store trained classifier in a file,
and later load trained model.

However, in this case the user must manually take care of situations where something
in the pipeline has changed (for instance, the train/test splitting).

Cache estimators are lazy way to store trained model.
After training, classifier/regressor is stored in the file under specific name (which was passed in constructor).

On the next runs following conditions are checked:

* model has the same name
* model trained has exactly same parameters
* model is trained using exactly the same data
* stored copy is not too old (10 days by default)

If all the conditions are satisfied, the stored copy is loaded, otherwise the classifier/regressor is fitted.

Example of usage
----------------

:class:`CacheClassifier` and :class:`CacheRegressor` work as meta-estimators

>>> from rep.estimators import XGBoostClassifier
>>> from rep.metaml import FoldingClassifier
>>> from rep.metaml.cache import CacheClassifier
>>> clf = CacheClassifier('xgboost folding', FoldingClassifier(XGBoostClassifier(), n_folds=3))
>>> # this works normally
>>> clf.fit(X, y, sample_weight)
>>> clf.predict_proba(testX)

However in the following situation:

>>> clf = FoldingClassifier(CacheClassifier('xgboost', XGBoostClassifier()))

cache is not going to work, because for each fold a copy of classifier is created.
Each time after looking at cache, a version with same parameters, but different data will be found.

So, every time stored copy will be erased and a new one saved.


By default, cache is stored in '.cache/rep' subfolder of project directory (where the ipython notebook is placed).
To change parameters of caching use:

>>> import rep.metaml.cache
>>> from rep.metaml._cache import CacheHelper
>>> rep.metaml.cache.cache_helper = CacheHelper(folder, expiration_in_seconds)
>>> # to delete all cached items, use:
>>> rep.metaml.cache.cache_helper.clear_cache()

"""

from __future__ import division, print_function, absolute_import

__author__ = 'Alex Rogozhnikov'

from ..estimators.interface import Classifier, Regressor
from ._cache import CacheHelper
from ..estimators import SklearnClassifier, SklearnRegressor
import hashlib
from six.moves import cPickle
from sklearn.base import ClassifierMixin, RegressorMixin

# To change cache parameters use
cache_helper = CacheHelper(folder='./.cache/rep',
                           expiration_in_seconds=10 * 24 * 60 * 60,  # 10 days
                           )

__all__ = ['CacheClassifier', 'CacheRegressor']


class CacheBase(object):
    """
    Base mixin implementing lazy caching of fitted estimators.

    Subclasses combine this with an sklearn-style wrapper; ``_fit`` checks the
    module-level ``cache_helper`` for a stored model before training.
    """

    def __init__(self, name, clf, features=None):
        """
        Cache {estimator} allows to save trained models in lazy way.
        Useful when training {estimator} takes much time.

        On the next run, stored model in cache will be used instead of fitting again.

        :param name: unique name of classifier (to be used in storing)
        :param sklearn.BaseEstimator clf: your estimator, which will be used for training
        :param features: features to use in training.
        """
        self.features = features
        self.name = name
        self.clf = clf
        # None until fitted; afterwards True if a cached copy was loaded,
        # False if the estimator was actually trained.
        self._used_cache = None

    def _fit(self, X, y, **kwargs):
        """
        Train the {estimator}

        :param pandas.DataFrame X: data shape [n_samples, n_features]
        :param y: target of training - array-like of shape [n_samples]
        :param sample_weight: weights of events,
               array-like of shape [n_samples] or None if all weights are equal

        :return: self
        """
        X = self._get_features(X)
        # Hash keyword names AND values (fix: `sorted(kwargs)` hashed only the
        # names, so e.g. changing sample_weight did not invalidate the cache).
        # kwargs keys are unique, so sorting by key never compares the values.
        parameters = self.clf, X, y, sorted(kwargs.items())
        hash_value = hashlib.sha1(cPickle.dumps(parameters)).hexdigest()
        is_found, trained_clf = cache_helper.get_from_cache(self.name, hash_value)
        if is_found:
            print('Found a trained copy, skipped fitting.')
            self.clf = trained_clf
            self._used_cache = True
        else:
            print('Not found in the cache (previous version may have expired). Fitting.')
            self.clf.fit(X, y, **kwargs)
            # Replace any stale entry stored under the same name.
            cache_helper.store_in_cache(self.name, hash_value, self.clf)
            self._used_cache = False
        return self

    def set_params(self, **params):
        """
        Set the parameters of this estimator.
        Parameters of base estimator can be accessed (for example param `depth`) by both *depth* and *clf__depth*.

        :param dict params: parameters to set in model
        """
        params_for_clf = {}
        for name, value in params.items():
            if name == 'features':
                self.features = value
            elif name == 'name':
                self.name = value
            elif name == 'clf':
                self.clf = value
            elif name.startswith('clf__'):
                # strip the 'clf__' prefix and forward to the wrapped estimator
                params_for_clf[name[5:]] = value
            else:
                # unknown names are forwarded to the wrapped estimator as-is
                params_for_clf[name] = value
        self.clf.set_params(**params_for_clf)

class CacheClassifier(CacheBase, SklearnClassifier):
    """Cache wrapper for classifiers; see module docstring for usage."""

    def __init__(self, name, clf, features=None):
        # Refuse to wrap anything that is not an sklearn-style classifier.
        if not isinstance(clf, ClassifierMixin):
            raise ValueError('passed model should be derived from ClassifierMixin!')
        CacheBase.__init__(self, name=name, clf=clf, features=features)
        Classifier.__init__(self, features=features)

    # Reuse the templated CacheBase docstring with the right estimator word.
    __init__.__doc__ = CacheBase.__init__.__doc__.format(estimator='classifier')
class CacheRegressor(CacheBase, SklearnRegressor):
    """Cache wrapper for regressors; see module docstring for usage."""

    def __init__(self, name, clf, features=None):
        # Refuse to wrap anything that is not an sklearn-style regressor.
        if not isinstance(clf, RegressorMixin):
            raise ValueError('passed model should be derived from RegressorMixin!')
        CacheBase.__init__(self, name=name, clf=clf, features=features)
        Regressor.__init__(self, features=features)

    # Reuse the templated CacheBase docstring with the right estimator word.
    __init__.__doc__ = CacheBase.__init__.__doc__.format(estimator='regressor')