Source code for menelaus.concept_drift.adwin_accuracy

from menelaus.change_detection.adwin import ADWIN


class ADWINAccuracy(ADWIN):
    """ADWIN (ADaptive WINdowing) is a change detection algorithm which uses a
    sliding window to estimate the running mean and variance of a given
    real-valued number. It can be applied as a concept drift detector by
    monitoring a performance metric for a given classifier. ADWINAccuracy
    specifically expects ``y_true``, ``y_pred``, and uses that input to monitor
    the running accuracy of a classifier. To use ADWIN to monitor other values,
    see ``change_detection.ADWIN``.

    As each sample is added, ADWIN stores a running estimate (mean and variance)
    for a given statistic, calculated over a sliding window which will grow to
    the right until drift is detected. The condition for drift is defined over
    pairs of subwindows at certain cutpoints within the current window. If, for
    any such pair, the difference between the running estimates of the statistic
    is over a certain threshold (controlled by delta), we identify drift, and
    remove the oldest elements of the window until all differences are again
    below the threshold.

    The running estimates in each subwindow are maintained by storing summaries
    of the elements in "buckets," which, in this implementation, are themselves
    stored in the ``bucket_row_list`` attribute, whose total size scales with the
    ``max_buckets`` parameter.

    When drift occurs, the index of the element at the beginning of ADWIN's new
    window is stored in ``self.retraining_recs``.

    Ref. :cite:t:`bifet2007learning`
    """

    def __init__(
        self,
        delta=0.002,
        max_buckets=5,
        new_sample_thresh=32,
        window_size_thresh=10,
        subwindow_size_thresh=5,
        conservative_bound=False,
    ):
        """

        Args:
            delta (float, optional): confidence value on the range 0 to 1.
                ADWIN will incorrectly detect drift with at most probability
                ``delta``, and correctly detect drift with at least probability
                ``1 - delta``. Defaults to 0.002.
            max_buckets (int, optional): the maximum number of buckets to
                maintain in each BucketRow. Corresponds to the "M" parameter in
                Bifet 2006. Defaults to 5.
            new_sample_thresh (int, optional): the drift detection procedure
                will run every ``new_sample_thresh`` samples, not in between.
                Defaults to 32.
            window_size_thresh (int, optional): the minimum number of samples in
                the window required to check for drift. Defaults to 10.
            subwindow_size_thresh (int, optional): the minimum number of samples
                in each subwindow required to check it for drift. Defaults to 5.
            conservative_bound (bool, optional): whether to assume a 'large
                enough' sample when constructing drift cutoff. Defaults to ``False``.

        Raises:
            ValueError: If ``ADWIN.delta`` is not on the range 0 to 1.
        """
        # Forward the caller's configuration to the parent detector rather
        # than re-stating the defaults, so non-default arguments take effect.
        super().__init__(
            delta=delta,
            max_buckets=max_buckets,
            new_sample_thresh=new_sample_thresh,
            window_size_thresh=window_size_thresh,
            subwindow_size_thresh=subwindow_size_thresh,
            conservative_bound=conservative_bound,
        )

    def update(self, y_true, y_pred, X=None):
        """Update the detector with a new sample.

        The pair ``(y_true, y_pred)`` is reduced to a single 0/1 correctness
        indicator, which is then passed to the parent ``update`` as the
        monitored value.

        Args:
            y_true: one true label from input data.
            y_pred: one predicted label from input data.
            X: next sample in the stream of data. Not used for this
                accuracy-based ADWIN. See ``change_detection.ADWIN`` for that
                application.
        """

        # This class is here to avoid asking the user to provide such a direct
        # function of (y_true, y_pred) in the X argument, which is unintuitive.
        _, y_true, y_pred = super()._validate_input(None, y_true, y_pred)
        # the arrays should have a single element after validation, so unpack
        # them before comparing rather than comparing the containers.
        y_true, y_pred = y_true[0], y_pred[0]
        # 1 when the prediction was correct, 0 otherwise.
        new_value = int(y_true == y_pred)
        # The parent monitors the correctness indicator; labels are not
        # forwarded.
        super().update(new_value, y_true=None, y_pred=None)