Source code for rep.data.storage

"""
This is a wrapper for `pandas.DataFrame` that lets you define a dataset (data, labels/values, sample weights) for an estimator in a simple way.
"""
from __future__ import division, print_function, absolute_import
import numbers

from numpy.random.mtrand import RandomState
import pandas
import numpy
from sklearn.utils import check_random_state

from ..utils import get_columns_dict, get_columns_in_df


# Exclusive upper bound for generated random seeds: they fall in [0, RANDINT)
RANDINT = 10 ** 7


class LabeledDataStorage(object):
    """
    Container bundling features, targets and sample weights for an estimator.

    Optionally exposes all three in a (consistently) shuffled order.
    """

    def __init__(self, data, target=None, sample_weight=None, random_state=None, shuffle=False):
        """
        This class implements an interface of data for estimators training.
        It contains data, labels/values and weights - all information to train a model.

        :param pandas.DataFrame data: features, array-like of shape [n_samples, n_features]
        :param target: labels/values for classification/regression
            (set None for the predictive methods)
        :type target: None or numbers.Number or array-like
        :param sample_weight: weight (set None for predictive methods)
        :type sample_weight: None or numbers.Number or array-like
        :param random_state: state for a pseudo random generator
        :type random_state: None or int or RandomState
        :param bool shuffle: shuffle data or not
        """
        self.data = data
        # A missing target is replaced by an array of ones (allow_nones is False here),
        # while missing weights are kept as None.
        self.target = self._get_key(self.data, target)
        self.sample_weight = self._get_key(self.data, sample_weight, allow_nones=True)
        assert len(self.data) == len(self.target), 'ERROR: Lengths are different for data and target'
        if self.sample_weight is not None:
            assert len(self.data) == len(self.sample_weight), \
                'ERROR: Lengths are different for data and sample_weight'
        # Only an integer seed is stored; the permutation is built lazily in get_indices().
        self._random_state = check_random_state(random_state).randint(RANDINT)
        self.shuffle = shuffle
        self._indices = None

    def _get_key(self, ds, key, allow_nones=False):
        """
        Get data from the storage by key.

        :param pandas.DataFrame ds: data
        :param key: key, which describe data in the storage
        :type key: None or numbers.Number or str or array-like
        :param bool allow_nones: if True, a None key is returned as None;
            otherwise a None key yields an array of ones of length len(ds)
        :return: data corresponding to the key
        """
        if isinstance(key, str) and ds is not None:
            # A string key is resolved through the column helpers against `ds`.
            name = list(get_columns_dict([key]).keys())[0]
            return numpy.array(get_columns_in_df(ds, [key])[name])
        if isinstance(key, numbers.Number):
            # A scalar is broadcast to one value per row.
            return numpy.array([key] * len(ds))
        if key is not None:
            return numpy.array(key)
        if allow_nones:
            return None
        return numpy.ones(len(ds))

    def __len__(self):
        """
        Return number of samples.

        :return: count of rows in the storage
        :rtype: int
        """
        return len(self.data)

    def get_data(self, features=None):
        """
        Return data.

        :param features: set of feature names (if None then use all features in data storage)
        :type features: None or list[str]
        :rtype: pandas.DataFrame
        """
        df = get_columns_in_df(self.data, features)
        if self.shuffle:
            # Positional row selection; `.iloc` replaces the removed `DataFrame.irow`.
            return df.iloc[self.get_indices()]
        return df

    def get_targets(self):
        """
        Return sample target, labels or values.

        :rtype: numpy.array
        """
        if self.shuffle:
            return self.target[self.get_indices()]
        return self.target

    def get_weights(self, allow_nones=False):
        """
        Return sample weights.

        :param bool allow_nones: if True and no weights were given, return None
            instead of an array of ones
        :rtype: numpy.array or None
        """
        if self.sample_weight is None:
            if allow_nones:
                return None
            return numpy.ones(len(self.data))
        if self.shuffle:
            return self.sample_weight[self.get_indices()]
        return self.sample_weight

    def get_indices(self):
        """
        Return data indices.

        The permutation is generated once from the stored seed and then cached,
        so data, targets and weights are reordered identically.

        :rtype: numpy.array
        """
        if self._indices is None:
            rs = RandomState(seed=self._random_state)
            self._indices = rs.permutation(len(self))
        return self._indices

    def col(self, index):
        """
        Return column from the data.

        :param index: names
        :type index: None or str or list(str)
        :rtype: pandas.Series or pandas.DataFrame
        """
        if isinstance(index, str):
            name = list(get_columns_dict([index]).keys())[0]
            return self.get_data([index])[name]
        return self.get_data(index)

    def eval_column(self, expression):
        """
        Evaluate some expression to obtain necessary columns for the data.

        :param expression: a constant (broadcast to every row), a column name/expression
            passed to :meth:`col`, a callable applied to the full data frame,
            or an array-like with one entry per row
        :type expression: numbers.Number or array-like or str or function(pandas.DataFrame)
        :rtype: numpy.array
        """
        if isinstance(expression, numbers.Number):
            return numpy.zeros(len(self), dtype=type(expression)) + expression
        if isinstance(expression, str):
            return numpy.array(self.col(expression))
        if hasattr(expression, '__call__'):
            return numpy.array(expression(self.get_data()))
        assert len(expression) == len(self), 'Different length'
        return numpy.array(expression)