# Source code for menelaus.injection.label_manipulation

import numpy as np

from menelaus.injection.injector import Injector


# region - Simple Label Manipulation


class LabelSwapInjector(Injector):
    """
    Exchanges the labels of two classes inside a row window of a target
    column: rows labelled ``class_1`` become ``class_2`` and vice versa.
    Accepts ``pandas.DataFrame`` with column names or ``numpy.ndarray``
    with column indices.

    Ref. :cite:t:`souza2020challenges`
    """

    def __call__(self, data, from_index, to_index, target_col, class_1, class_2):
        """
        Args:
            data (np.array or pd.DataFrame): data to inject with drift
            from_index (int): first row index of the swap window
            to_index (int): row index at which the swap window ends
                (non-inclusive)
            target_col (int or str): column index/name of targets column
            class_1 (int): value of first label in class swap
            class_2 (int): value of second label in class swap

        Returns:
            np.array or pd.DataFrame: copy of data, with two classes swapped
                in given target column, over given indices
        """
        # input handling is delegated to the base class; it hands back the
        # array to operate on and the resolved target column
        working, (target_col,) = self._preprocess(data, target_col)

        def _rows_labelled(label):
            # row positions holding `label` in the target column, restricted
            # to the half-open window [from_index, to_index)
            hits = np.where(working[:, target_col] == label)[0]
            inside_window = (hits >= from_index) & (hits < to_index)
            return hits[inside_window]

        # resolve both row sets before writing anything, so the second
        # lookup cannot pick up rows relabelled by the first assignment
        first_rows = _rows_labelled(class_1)
        second_rows = _rows_labelled(class_2)
        working[first_rows, target_col] = class_2
        working[second_rows, target_col] = class_1

        # output handling is delegated to the base class as well
        return self._postprocess(working)


class LabelJoinInjector(Injector):
    """
    Merges two classes into a single class by relabelling every row of
    either class, inside a row window, with one new label. Accepts
    ``pandas.DataFrame`` with column names or ``numpy.ndarray``
    with column indices.

    Ref. :cite:t:`souza2020challenges`
    """

    def __call__(
        self, data, from_index, to_index, target_col, class_1, class_2, new_class
    ):
        """
        Args:
            data (np.array or pd.DataFrame): data to inject with drift
            from_index (int): first row index of the join window
            to_index (int): row index at which the join window ends
                (non-inclusive)
            target_col (int or str): column index/name of targets column
            class_1 (int): value of first label in class join
            class_2 (int): value of second label in class join
            new_class (int): new label value to assign to old classes

        Returns:
            np.array or pd.DataFrame: copy of data, with two classes joined
                in given target column, over given indices, into new class
        """
        # input handling is delegated to the base class; it hands back the
        # array to operate on and the resolved target column
        merged, (target_col,) = self._preprocess(data, target_col)

        # rows currently carrying either of the two labels
        labels = merged[:, target_col]
        matches = np.where((labels == class_1) | (labels == class_2))[0]

        # keep only the rows inside the half-open window, then relabel them
        inside_window = (matches >= from_index) & (matches < to_index)
        merged[matches[inside_window], target_col] = new_class

        # output handling is delegated to the base class as well
        return self._postprocess(merged)


# endregion

# region - LTF-Inspired Label Manipulation


class LabelProbabilityInjector(Injector):
    """
    Resamples the data over a specified window, with altered probability
    for specified classes (and uniform probability for remaining classes).
    Accepts ``pandas.DataFrame`` with column names or ``numpy.ndarray``
    with column indices.

    Note:
    * this function can perform tweak-one and minority shift
    * When a class is specified in ``class_probabilities`` but has no
    rows in the window, its probability mass is spread evenly over
    the rows that are present in the window.
    * The per-row sampling weights used by the last call are kept in
    ``self._p_distribution``.

    Ref. :cite:t:`LTFmethods`
    """

    def __call__(self, data, from_index, to_index, target_col, class_probabilities):
        """
        Args:
            data (np.array or pd.DataFrame): data to inject with drift
            from_index (int): row index at which to start resampling
            to_index (int): row index at which to end (non-inclusive)
                resampling
            target_col (int or str): column index/name of targets column
            class_probabilities (dict): classes as keys, and their desired
                resampling chance as values. Un-specified classes are given
                a uniform resampling chance with respect to all other
                un-specified classes. The dict is not modified.

        Returns:
            np.array or pd.DataFrame: copy of data, resampled with shifted
                class probability for 1 or more desired classes. If the
                window contains no rows, the data is returned without
                resampling.

        Raises:
            ValueError: if the specified probabilities sum to more than 1
                (beyond floating-point rounding), or if
                ``class_probabilities`` names a class not found in the
                target column.
        """
        # slack for sums such as 1.0000000000000002 that arise purely from
        # floating-point rounding of an otherwise valid distribution; kept
        # well below the tolerance np.random.choice applies to ``p``
        rounding_tolerance = 1e-9

        # handle data type
        ret, (target_col,) = self._preprocess(data, target_col)

        # determine all unique classes and classes not specified in args
        all_classes = np.unique(ret[:, target_col])
        undefined_classes = [k for k in all_classes if k not in class_probabilities]

        # specified class probabilities must sum to 1 or less
        specified_total = sum(class_probabilities.values())
        if specified_total > 1.0 + rounding_tolerance:
            raise ValueError(f"Probabilities in {class_probabilities} exceed 1")

        # args should not specify previously unseen classes
        if set(all_classes) != set(
            list(class_probabilities.keys()) + undefined_classes
        ):
            raise ValueError(
                f"Argument {class_probabilities} has classes not found in data {all_classes}"
            )

        # fill in the un-specified classes on a private copy, so the
        # caller's dict is left untouched
        class_probabilities = dict(class_probabilities)
        missing_probability = max(0.0, 1 - specified_total)
        for uc in undefined_classes:
            class_probabilities[uc] = missing_probability / len(undefined_classes)

        # distribution for each data point, and reordering of each point by class
        self._p_distribution = []
        sample_idxs_grouped = []

        # locate each class in window
        for cls in all_classes:
            cls_idx = np.where(ret[:, target_col] == cls)[0]
            cls_idx = cls_idx[(cls_idx < to_index) & (cls_idx >= from_index)]
            class_size = cls_idx.shape[0]

            # each member gets p_class / class_size; a class with no rows in
            # the window contributes nothing (and avoids a division by zero)
            p_individual = class_probabilities[cls] / class_size if class_size else 0

            # append to grouped array and corresponding distribution
            sample_idxs_grouped.extend(cls_idx)
            self._p_distribution.extend(np.ones(class_size) * p_individual)

        # nothing to resample when no row falls inside the window
        if not sample_idxs_grouped:
            return self._postprocess(ret)

        # if classes were skipped, hand their mass out evenly per row so the
        # distribution adds to 1; clamp at 0 so rounding can never push a
        # zero-probability row below zero
        p_leftover = max(0.0, 1 - sum(self._p_distribution)) / len(
            self._p_distribution
        )
        self._p_distribution = [p + p_leftover for p in self._p_distribution]

        # shuffled sample over window, with replacement, with weights
        sample_idxs = np.random.choice(
            sample_idxs_grouped,
            size=to_index - from_index,
            replace=True,
            p=self._p_distribution,
        )
        ret[from_index:to_index] = ret[sample_idxs]

        # handle data type and return
        return self._postprocess(ret)


class LabelDirichletInjector(Injector):
    """
    Resamples the data over a specified window, per a generated Dirichlet
    distribution (with specified alpha) over all labels. Accepts
    ``pandas.DataFrame`` with column names or ``numpy.ndarray``
    with column indices.

    Notes:
    * ``alpha`` is expected to carry a weight for every label. Labels left
    out of ``alpha`` are passed on as un-specified classes to
    ``LabelProbabilityInjector``, which divides whatever probability is
    not already assigned evenly among them.
    * The drawn distribution of the last call is kept in
    ``self._dirichlet_distribution`` and, keyed by label, in
    ``self._dirichlet_probabilities``.

    Ref. :cite:t:`LTFmethods`
    """

    def __call__(self, data, from_index, to_index, target_col, alpha):
        """
        Args:
            data (np.array or pd.DataFrame): data to inject with drift
            from_index (int): row index at which to start resampling
            to_index (int): row index at which to end (non-inclusive)
                resampling
            target_col (int or str): column index/name of targets column
            alpha (dict): used to derive alpha parameter for Dirichlet
                distribution. Keys are ALL labels, values are the desired
                average weight (typically ``int``) per label when resampling.
                For example, weights of ``{0: 4, 1: 1}`` correspond to an
                expected 80/20 percent split between classes 0 and 1.

        Returns:
            np.array or pd.DataFrame: copy of data, resampled per Dirichlet
                distribution over classes with specified alpha
        """
        # split the weights dictionary into parallel lists; both come from
        # the same dict, so position i of one matches position i of the other
        self._alpha_classes = list(alpha.keys())
        self._alpha_values = list(alpha.values())

        # draw one probability per class, then pair each class with its draw
        self._dirichlet_distribution = np.random.dirichlet(self._alpha_values)
        self._dirichlet_probabilities = dict(
            zip(self._alpha_classes, self._dirichlet_distribution)
        )

        # delegate the resampling, passing the drawn distribution as the
        # per-class probabilities
        label_prob_injector = LabelProbabilityInjector()
        return label_prob_injector(
            data,
            from_index=from_index,
            to_index=to_index,
            target_col=target_col,
            class_probabilities=self._dirichlet_probabilities,
        )


# endregion