Source code for menelaus.injection.injector

from abc import ABC, abstractclassmethod, abstractmethod

import numpy as np
import pandas as pd


class Injector(ABC):
    """
    Abstract base class for drift injection callables.

    Sub-classes implement ``__call__`` and are expected to use the
    ``_preprocess()`` / ``_postprocess()`` helpers defined here for input and
    output type handling.

    When called, an ``Injector`` typically accepts either ``pandas.DataFrame``
    or ``numpy.ndarray`` data, along with other parameters for the injection
    technique it is modeling, and returns the data with drift injected over a
    specified window.
    """

    def _preprocess(self, data, *columns, return_df=False):
        """
        Copy the data into the desired working format and normalise any column
        indicators. By default the data is returned as ``numpy.ndarray`` and
        the columns as integer column indices.

        As a side effect, the input's column labels (or ``None`` for
        ``numpy.ndarray`` input) are recorded on ``self._columns`` so that
        ``_postprocess()`` can restore the original container type.

        Note:
            * ``*columns`` only needs to contain the columns used by a
              sub-class' ``__call__`` function, not necessarily ALL columns.
            * If the input is a ``numpy.ndarray``, ``*columns`` should be
              integers; they are returned unchanged.
            * String names are only valid in ``*columns`` when the input is a
              ``pandas.DataFrame``; each one is looked up in ``data.columns``.

        Args:
            data (numpy.ndarray or pd.DataFrame): data to preprocess
            *columns (str or int): 1 or more integers or names corresponding to
                columns in the data, to also be preprocessed
            return_df (bool): Whether to return data as ``pandas.DataFrame``,
                in which case ``*columns`` are returned as-is. Default False.

        Returns:
            tuple: first item is a copy of the data as ``numpy.ndarray``,
                unless ``return_df`` is True, in which case a
                ``pandas.DataFrame`` built from that copy and the recorded
                column labels is returned. Second item is the column
                indicators as integer indexes, unless ``return_df`` is True, in
                which case the column indicators are returned as-is.

        Raises:
            ValueError: if ``data`` is neither a ``numpy.ndarray`` nor a
                ``pandas.DataFrame``.
        """
        if isinstance(data, np.ndarray):
            # ndarray input has no labels to remember; indicators are taken
            # to be positional already
            self._columns = None
            column_idxs = columns
        elif isinstance(data, pd.DataFrame):
            # remember the labels so _postprocess() can rebuild a DataFrame,
            # and translate each requested label into its position
            self._columns = data.columns
            column_idxs = tuple(data.columns.get_loc(c) for c in columns)
        else:
            # only two types supported
            raise ValueError(f"Data of type {type(data)} not supported")

        # np.copy yields an ndarray for both supported input types, so the
        # caller's object is never modified by the injection
        copy = np.copy(data)
        if return_df:
            return pd.DataFrame(copy, columns=self._columns), columns
        return copy, column_idxs

    def _postprocess(self, data):
        """
        Postprocesses data and returns it as its initial type, which was
        determined and recorded during preprocessing.

        If ``_preprocess()`` has not been called on this instance, no initial
        type is known and the data is returned unchanged.

        Args:
            data (``numpy.ndarray`` or ``pandas.DataFrame``): data to
                postprocess

        Returns:
            data (``numpy.ndarray`` or ``pandas.DataFrame``): data,
                transformed into its original container type
        """
        # nothing was recorded, so there is no original type to restore
        if not hasattr(self, "_columns"):
            return data

        is_df = isinstance(data, pd.DataFrame)

        # column names were stored but data is not a DF: rebuild the DF
        if self._columns is not None and not is_df:
            return pd.DataFrame(data, columns=self._columns)

        # no column names stored but data is a DF: original was an ndarray
        if self._columns is None and is_df:
            return data.to_numpy()

        # data is already in its original container type
        return data

    @abstractmethod
    def __call__(self):
        """Implemented by sub-classes."""
        raise NotImplementedError