# Source code for menelaus.injection.feature_manipulation

import numpy as np

from menelaus.injection.injector import Injector


class FeatureShiftInjector(Injector):
    """
    Shifts one column of a dataset by a fixed offset derived from the mean of
    the rows being shifted.

    Formula:

    * ``column = column + (shift_factor * (alpha + mean_column))``

    ``mean_column`` is the mean of the column over the selected row window.
    ``alpha`` is a small value that keeps the offset non-zero even if that
    mean is 0.
    """

    def __call__(self, data, from_index, to_index, col, shift_factor, alpha=0.001):
        """
        Args:
            data (np.ndarray or pd.DataFrame): data to inject with drift
            from_index (int): row index at which to start shift
            to_index (int): row index at which to end (non-inclusive) shift
            col (int or str): column index/name of column to shift
            shift_factor (float): multiplier applied to ``alpha + mean`` to
                obtain the offset added to the column
            alpha (float): small initial value added to the mean before
                scaling, in case the mean is 0. Default 0.001

        Returns:
            np.ndarray or pd.DataFrame: copy of data, with the given column
            shifted over the given indices
        """
        # handle type
        ret, (col,) = self._preprocess(data, col)

        # The offset is computed once, from the window's pre-shift values.
        # Both intermediate quantities are kept on the instance.
        window = ret[from_index:to_index, col]
        self._section_mean = np.mean(window)
        self._delta = (alpha + self._section_mean) * shift_factor

        # add shift
        ret[from_index:to_index, col] = window + self._delta

        # handle type and return
        return self._postprocess(ret)
class FeatureSwapInjector(Injector):
    """
    Exchanges the values of two features/columns of a dataset over a window
    of rows. Accepts ``pandas.DataFrame`` with column names or
    ``numpy.ndarray`` with column indices.

    Ref. :cite:t:`souza2020challenges`
    """

    def __call__(self, data, from_index, to_index, col_1, col_2):
        """
        Args:
            data (np.ndarray or pd.DataFrame): data to inject with drift
            from_index (int): row index at which to start column swap
            to_index (int): row index at which to end (non-inclusive) column
                swap
            col_1 (int or str): column index/name of first column
            col_2 (int or str): column index/name of second column

        Returns:
            np.ndarray or pd.DataFrame: copy of data, with two columns swapped
            over given indices
        """
        # handle type
        ret, (col_1, col_2) = self._preprocess(data, col_1, col_2)

        # swap columns: read the pair in reversed order and write it back in
        # the original order, restricted to the requested rows
        rows = slice(from_index, to_index)
        swapped = ret[rows, [col_2, col_1]]
        ret[rows, [col_1, col_2]] = swapped

        # handle type and return
        return self._postprocess(ret)
class FeatureCoverInjector(Injector):
    """
    Hides a feature, then treats it as a shared concept by which to group the
    data. Afterwards the same number of samples is drawn from each group.
    Accepts ``pandas.DataFrame`` with column names or ``numpy.ndarray`` with
    column indices.

    Note:

    * This function cannot hide a feature over a window, as the appearance
      of the column outside the window may raise errors.

    Ref. :cite:t:`souza2020challenges`
    """

    def __call__(self, data, col, sample_size, random_state=None):
        """
        Args:
            data (np.ndarray or pd.DataFrame): data to inject with drift
            col (int or str): index/label of column to hide and re-sample
                (note this should be a categorical feature that can be
                treated as a concept)
            sample_size (int): number of data points to draw, divided evenly
                (integer division) among the distinct values of ``col``; each
                group contributes ``sample_size // number_of_groups`` rows
            random_state (int): optional random seed forwarded to the
                per-group sampling. Default None

        Returns:
            np.ndarray or pd.DataFrame: copy of data, grouped by indicated
            column, with each group sampled and column removed
        """
        # handle type
        ret, (col,) = self._preprocess(data, col, return_df=True)

        # each distinct value of the hidden column is one group; the
        # requested sample size is split evenly among them
        group_count = len(ret[col].unique())
        per_group = sample_size // group_count

        # hide and reorder
        sampled = ret.groupby(col).sample(n=per_group, random_state=random_state)
        ret = sampled.drop(columns=[col]).reset_index(drop=True)

        # handle type and return
        return self._postprocess(ret)