Source code for menelaus.injection.label_manipulation

import numpy as np

from menelaus.injection.injector import Injector


# region - Simple Label Manipulation


class LabelSwapInjector(Injector):
    """
    Exchange two class labels with one another inside a row window of the
    target column.

    Accepts ``pandas.DataFrame`` with column names or ``numpy.ndarray`` with
    column indices.

    Ref. :cite:t:`souza2020challenges`
    """

    def __call__(self, data, from_index, to_index, target_col, class_1, class_2):
        """
        Args:
            data (np.array): data to inject with drift
            from_index: first row index (inclusive) of the swap window
            to_index: last row index (exclusive) of the swap window
            target_col (int or str): column index/name of targets column
            class_1 (int): label value that becomes ``class_2``
            class_2 (int): label value that becomes ``class_1``

        Returns:
            np.array or pd.DataFrame: copy of data in which the two labels
            have traded places in the target column, for rows inside the
            window only
        """
        # normalise the input container and resolve the target column
        ret, (target_col,) = self._preprocess(data, target_col)

        # Collect the in-window row positions of each class *before* writing
        # anything, so rows relabelled by the first assignment are not picked
        # up again by the second.
        labels = ret[:, target_col]
        rows_by_class = []
        for label in (class_1, class_2):
            hits = np.where(labels == label)[0]
            inside_window = (hits >= from_index) & (hits < to_index)
            rows_by_class.append(hits[inside_window])
        first_rows, second_rows = rows_by_class

        ret[first_rows, target_col] = class_2
        ret[second_rows, target_col] = class_1

        # convert back to the caller's container type
        return self._postprocess(ret)
class LabelJoinInjector(Injector):
    """
    Merge two class labels into a single label inside a row window of the
    target column.

    Accepts ``pandas.DataFrame`` with column names or ``numpy.ndarray`` with
    column indices.

    Ref. :cite:t:`souza2020challenges`
    """

    def __call__(
        self, data, from_index, to_index, target_col, class_1, class_2, new_class
    ):
        """
        Args:
            data (np.array): data to inject with drift
            from_index: first row index (inclusive) of the join window
            to_index: last row index (exclusive) of the join window
            target_col (int or str): column index/name of targets column
            class_1 (int): first label value to be replaced
            class_2 (int): second label value to be replaced
            new_class (int): label value written in place of both old labels

        Returns:
            np.array or pd.DataFrame: copy of data in which every in-window
            occurrence of either old label in the target column has been
            replaced by the new label
        """
        # normalise the input container and resolve the target column
        ret, (target_col,) = self._preprocess(data, target_col)

        # rows carrying either of the two labels ...
        labels = ret[:, target_col]
        carries_old_label = (labels == class_1) | (labels == class_2)
        rows = np.where(carries_old_label)[0]

        # ... restricted to the half-open window [from_index, to_index)
        rows = rows[(rows >= from_index) & (rows < to_index)]
        ret[rows, target_col] = new_class

        # convert back to the caller's container type
        return self._postprocess(ret)
# endregion


# region - LTF-Inspired Label Manipulation
class LabelProbabilityInjector(Injector):
    """
    Resamples the data over a specified window, with altered probability for
    specified classes (and uniform probability for remaining classes).
    Accepts ``pandas.DataFrame`` with column names or ``numpy.ndarray`` with
    column indices.

    Note:
        * this function can perform tweak-one and minority shift
        * When a class has a probability assigned but no rows inside the
          window, its probability mass cannot be used. That mass is spread
          evenly over the individual rows of the window, so classes with
          more rows in the window absorb more of it.
        * A window that contains no rows is left untouched.

    Ref. :cite:t:`LTFmethods`
    """

    def __call__(self, data, from_index, to_index, target_col, class_probabilities):
        """
        Args:
            data (np.array): data to inject with drift
            from_index (int): row index at which to start resampling
            to_index (int): row index at which to end (non-inclusive)
                resampling
            target_col (int or str): column index/name of targets column
            class_probabilities (dict): classes as keys, and their desired
                resampling chance as values. Un-specified classes are given
                a uniform resampling chance with respect to all other
                un-specified classes. The dict is not modified.

        Returns:
            np.array or pd.DataFrame: copy of data, resampled with shifted
            class probability for 1 or more desired classes

        Raises:
            ValueError: if the specified probabilities sum to more than 1
                (beyond floating-point rounding), or if
                ``class_probabilities`` names a class that does not occur
                in the target column.
        """
        # floating-point slack allowed when checking that probabilities sum to 1
        tolerance = 1e-9

        # handle data type
        ret, (target_col,) = self._preprocess(data, target_col)

        # Work on a private copy: the entries added below for un-specified
        # classes must not leak into the caller's dictionary.
        probabilities = dict(class_probabilities)

        # determine all unique classes and classes not specified in args
        all_classes = np.unique(ret[:, target_col])
        undefined_classes = [k for k in all_classes if k not in probabilities]

        # Specified class probabilities must sum to 1 or less. An exact
        # ``> 1.0`` test would reject valid distributions whose float sum
        # lands an ulp above 1 (e.g. a Dirichlet draw), hence the tolerance.
        specified_total = sum(probabilities.values())
        if specified_total - 1.0 > tolerance:
            raise ValueError(f"Probabilities in {class_probabilities} exceed 1")

        # args should not specify previously unseen classes
        if set(all_classes) != set(list(probabilities.keys()) + undefined_classes):
            raise ValueError(
                f"Argument {class_probabilities} has classes not found in data {all_classes}"
            )

        # undefined classes are resampled uniformly; clamp at 0 so a sum that
        # overshoots 1 by rounding never yields a negative probability
        missing_probability = max(1 - specified_total, 0)
        for uc in undefined_classes:
            probabilities[uc] = missing_probability / len(undefined_classes)

        # distribution for each data point, and reordering of each point by class
        self._p_distribution = []
        sample_idxs_grouped = []

        # locate each class in window
        for cls in all_classes:
            cls_idx = np.where(ret[:, target_col] == cls)[0]
            cls_idx = cls_idx[(cls_idx < to_index) & (cls_idx >= from_index)]
            class_size = cls_idx.shape[0]

            # each member has p_class / class_size chance; a class with no
            # rows in the window contributes nothing (and avoids div/0)
            p_individual = probabilities[cls] / class_size if class_size else 0

            # append to grouped array and corresponding distribution
            sample_idxs_grouped.extend(cls_idx)
            self._p_distribution.extend(np.ones(class_size) * p_individual)

        # Nothing to resample when the window holds no rows; return the
        # unmodified copy instead of dividing by zero below.
        if not sample_idxs_grouped:
            return self._postprocess(ret)

        # if classes skipped, ensure probability distribution adds to 1
        p_leftover = (1 - sum(self._p_distribution)) / len(self._p_distribution)
        self._p_distribution = [p + p_leftover for p in self._p_distribution]

        # A slightly negative leftover (sum above 1 by rounding) can push
        # zero-probability rows below 0, which np.random.choice rejects.
        # Clip and renormalise only in that case.
        if min(self._p_distribution) < 0:
            clipped = np.clip(self._p_distribution, 0, None)
            self._p_distribution = list(clipped / clipped.sum())

        # shuffled sample over window, with replacement, with weights
        sample_idxs = np.random.choice(
            sample_idxs_grouped, to_index - from_index, True, self._p_distribution
        )
        ret[from_index:to_index] = ret[sample_idxs]

        # handle data type and return
        return self._postprocess(ret)
class LabelDirichletInjector(Injector):
    """
    Resamples the data over a specified window, with per-label resampling
    probabilities drawn from a Dirichlet distribution whose concentration
    parameters are supplied by the caller. Accepts ``pandas.DataFrame`` with
    column names or ``numpy.ndarray`` with column indices.

    Notes:
        * Weights should be given for every label. The drawn probabilities
          are handed to ``LabelProbabilityInjector``, which splits whatever
          probability mass is left over uniformly among labels that were
          not given a weight.

    Ref. :cite:t:`LTFmethods`
    """

    def __call__(self, data, from_index, to_index, target_col, alpha):
        """
        Args:
            data (np.array): data to inject with drift
            from_index: row index at which to start resampling
            to_index: row index at which to end (non-inclusive) resampling
            target_col (int or str): column index/name of targets column
            alpha (dict): used to derive alpha parameter for Dirichlet
                distribution. Keys are ALL labels, values are the desired
                average weight (typically ``int``) per label when
                resampling. For example, weights of [4,1] correspond to an
                expected 80/20 percent split between first and second
                classes.

        Returns:
            np.array or pd.DataFrame: copy of data, resampled per Dirichlet
            distribution over classes with specified alpha
        """
        # Split the weights mapping into parallel label / weight lists. Both
        # come from the same dict, so position i refers to the same label in
        # each list.
        self._alpha_classes = list(alpha.keys())
        self._alpha_values = list(alpha.values())

        # one Dirichlet draw gives one probability per label, in label order
        self._dirichlet_distribution = np.random.dirichlet(self._alpha_values)
        self._dirichlet_probabilities = dict(
            zip(self._alpha_classes, self._dirichlet_distribution)
        )

        # delegate the actual resampling to the probability-shift injector
        resampler = LabelProbabilityInjector()
        return resampler(
            data,
            from_index=from_index,
            to_index=to_index,
            target_col=target_col,
            class_probabilities=self._dirichlet_probabilities,
        )
# endregion