Source code for menelaus.injection.injector

from abc import ABC, abstractclassmethod, abstractmethod

import numpy as np
import pandas as pd


class Injector(ABC):
    """
    Abstract base class for drift injection callables.

    Classes using this pattern should implement a ``__call__`` function, and
    also make use of the ``_preprocess()`` and ``_postprocess()`` type
    handling capabilities of this class.

    When called, an ``Injector`` typically accepts either
    ``pandas.DataFrame`` or ``numpy.ndarray`` data, along with other
    parameters for the injection technique it is modeling, and returns the
    data with drift injected over a specified window.
    """

    def _preprocess(self, data, *columns, return_df=False):
        """
        Preprocesses data and any column indicators into desired format.

        By default returns data as ``numpy.ndarray`` and any columns as
        integer column indices unless told otherwise. As a side effect, the
        column labels of a ``pandas.DataFrame`` input (or ``None`` for a
        ``numpy.ndarray`` input) are recorded on ``self._columns`` so that
        ``_postprocess()`` can restore the original type.

        Note:

        * ``*columns`` is used to preprocess any columns used by a
          sub-class' ``__call__`` function, and does not necessarily
          represent ALL columns
        * if the input is in ``numpy`` format, and a ``numpy.ndarray`` is
          desired, ``*columns`` should be integers
        * ``*columns`` should only be string names if the input is a
          ``pandas.DataFrame``

        Args:
            data (numpy.ndarray or pd.DataFrame): data to preprocess
            *columns (str or int): 1 or more integers or names corresponding
                to columns in the data, to also be preprocessed
            return_df (bool): Whether to return data as
                ``pandas.DataFrame``, in which case ``*columns`` are
                returned as-is. Default False.

        Returns:
            tuple: first item is a copy of the data as ``numpy.ndarray``,
            unless ``return_df`` is True, in which case a
            ``pandas.DataFrame`` built from that copy is returned. Second
            item is the column indicators as integer indexes, unless
            ``return_df`` is True, in which case the column indicators are
            returned as-is.

        Raises:
            ValueError: if ``data`` is neither a ``numpy.ndarray`` nor a
                ``pandas.DataFrame``.
        """
        if isinstance(data, np.ndarray):
            # No labels to remember; indicators are passed through untouched.
            self._columns = None
            column_idxs = columns
        elif isinstance(data, pd.DataFrame):
            # Remember the labels so _postprocess() can rebuild a DataFrame,
            # and translate each requested label to its positional index.
            self._columns = data.columns
            column_idxs = tuple(data.columns.get_loc(c) for c in columns)
        else:
            # Only two types supported.
            raise ValueError(f"Data of type {type(data)} not supported")

        # Work on a copy so the caller's object is never mutated.
        data_copy = np.copy(data)
        if return_df:
            return pd.DataFrame(data_copy, columns=self._columns), columns
        return data_copy, column_idxs

    def _postprocess(self, data):
        """
        Postprocesses data and returns as its initial type, which was
        determined and recorded during preprocessing.

        Args:
            data (``numpy.ndarray`` or ``pandas.DataFrame``): data to
                postprocess

        Returns:
            data (``numpy.ndarray`` or ``pandas.DataFrame``): data,
            transformed into its original state
        """
        is_df = isinstance(data, pd.DataFrame)

        # Column names were stored and data is not already a DataFrame:
        # rebuild the DataFrame with the recorded labels.
        if self._columns is not None and not is_df:
            return pd.DataFrame(data, columns=self._columns)

        # No column names stored but data is a DataFrame: the original input
        # was an ndarray, so hand one back.
        if self._columns is None and is_df:
            return data.to_numpy()

        # Remaining cases already match the original type; return unchanged.
        return data

    # ``abstractmethod`` (not the deprecated ``abstractclassmethod``) because
    # ``__call__`` is an instance-level hook that receives ``self``.
    @abstractmethod
    def __call__(self):
        """Implemented by sub-classes."""
        raise NotImplementedError