Source code for menelaus.datasets.make_example_data

""" Functions to generate example data according to a fixed scheme. """

import os
import pandas as pd
import numpy as np


def make_example_batch_data():
    """
    Return a dataframe of synthetic batch data for use with the repo's
    examples.

    The dataframe's columns are ``"year", "a", "b", ... "j", "cat",
    "confidence", "drift"``.

    * ``year`` covers 2007-2021, with 20,000 observations each.
    * Features ``"b", "e", "f"`` are normally distributed.
    * Features ``"a", "c", "d", "g", "h", "i", "j"`` have a gamma
      distribution.
    * The ``"cat"`` feature contains categorical values ranging from 0-6,
      sampled with varying probability.
    * ``"confidence"`` contains values uniform on ``[0, 0.6]`` through 2018,
      then values uniform on ``[0.4, 1]``.
    * ``"drift"`` is a boolean flag that is ``True`` for every row whose year
      is one of 2009, 2012, 2015, 2018, 2021.

    Drift occurs as follows:

    * Change the mean of column ``"b"`` in 2009. Reverts to original
      distribution in 2010.
    * Change the variance of columns ``"c"`` and ``"d"`` in 2012 by replacing
      some samples with (approximately) the column mean. Reverts to original
      distribution in 2013.
    * Increase the correlation of columns ``"e"`` and ``"f"`` in 2015
      (0 correlation to 0.5 correlation).
    * Change the range of the uniform distribution for ``confidence`` from
      ``[0, 0.6]`` to ``[0.4, 1]`` in 2019 and forward.
    * Change the mean and variance of column ``"h"`` in 2019, and maintain
      this new distribution going forward.
    * Change the mean and variance of column ``"j"`` in 2021.

    Note:
        Every call reseeds NumPy's *global* random state with
        ``np.random.seed(123)``. This makes the returned data identical from
        call to call, and also affects any later use of ``np.random`` by the
        caller.

    Returns:
        pd.DataFrame: A dataframe containing a synthetic batch dataset.
    """
    np.random.seed(123)
    year_size = 20000

    # One row per observation, already ordered by year, on a default
    # 0..n-1 index. int64 is spelled out so the dtype does not depend on the
    # platform's default integer width.
    df = pd.DataFrame(
        {"year": np.repeat(np.arange(2007, 2022, dtype=np.int64), year_size)}
    )
    sample_size = df.shape[0]
    year = df["year"]

    # Baseline (no-drift) features. The order of the random draws below is
    # significant: it determines which values each column receives for the
    # fixed seed, so it must not be rearranged.
    df["a"] = np.random.gamma(shape=8, size=sample_size) * 1000
    df["b"] = np.random.normal(loc=200, scale=10, size=sample_size)
    df["c"] = np.random.gamma(shape=7, size=sample_size) * 1000
    df["d"] = np.random.gamma(shape=10, size=sample_size) * 10000
    df[["e", "f"]] = np.random.multivariate_normal(
        mean=(0, 0), cov=np.array([[2, 0], [0, 2]]), size=sample_size
    )
    df["g"] = np.random.gamma(shape=11, size=sample_size) * 10000
    df["h"] = np.random.gamma(shape=12, size=sample_size) * 1000
    df["i"] = np.random.gamma(shape=9, size=sample_size) * 1000
    df["j"] = np.random.gamma(shape=10, size=sample_size) * 100
    df["cat"] = np.random.choice(
        range(7), size=sample_size, p=(0.3, 0.3, 0.2, 0.1, 0.05, 0.04, 0.01)
    )
    df["confidence"] = np.random.uniform(low=0, high=0.6, size=sample_size)

    ######################################################################
    # Drift 1: change the mean of b in 2009; later years keep the original
    # draws, so the mean reverts from 2010 on.
    df.loc[year == 2009, "b"] = np.random.normal(size=year_size, loc=500, scale=10)

    ######################################################################
    # Drift 2: change the variance of c and d in 2012 by replacing some
    # samples with the overall column mean plus small noise. The mean stays
    # the same as in other years; 2013 onward is untouched.
    mu_c = df["c"].mean()
    mu_d = df["d"].mean()
    in_2012 = year == 2012

    # subtle change: every 10th row (by index) of 2012
    every_tenth = in_2012 & (df.index % 10 == 0)
    df.loc[every_tenth, "c"] = mu_c + np.random.normal(
        loc=0, scale=10, size=int(every_tenth.sum())
    )

    # bigger change: every other row (by index) of 2012
    every_other = in_2012 & (df.index % 2 == 0)
    df.loc[every_other, "d"] = mu_d + np.random.normal(
        loc=0, scale=10, size=int(every_other.sum())
    )

    ######################################################################
    # Drift 3: change the correlation of e and f in 2015
    # (go from correlation of 0 to correlation of 0.5)
    df.loc[year == 2015, ["e", "f"]] = np.random.multivariate_normal(
        mean=(0, 0), cov=np.array([[2, 1], [1, 2]]), size=year_size
    )

    ######################################################################
    # Drift 4: change mean and var of h and the range of the confidence
    # scores for every year after 2018 (i.e. 2019 onward), and persist it.
    # The number of draws is taken from the mask so the two cannot disagree.
    after_2018 = year > 2018
    n_after_2018 = int(after_2018.sum())
    df.loc[after_2018, "h"] = (
        np.random.gamma(shape=1, scale=1, size=n_after_2018) * 1000
    )
    df.loc[after_2018, "confidence"] = np.random.uniform(
        low=0.4, high=1, size=n_after_2018
    )

    ######################################################################
    # Drift 5: change mean and var of j for 2021 only
    df.loc[year == 2021, "j"] = np.random.gamma(shape=10, size=year_size) * 10

    # NOTE(review): the flag marks 2018, while the persistent change to h and
    # confidence above is applied to years strictly after 2018. Kept as-is so
    # the generated dataset is unchanged -- confirm which year is intended.
    df["drift"] = year.isin([2009, 2012, 2015, 2018, 2021])
    return df
def fetch_circle_data():
    """Load the Circle dataset that ships in the datasets directory.

    Circle is synthetic data containing drift due to both a change in the
    feature distribution and a change in the conditional target distribution.
    Drift occurs from index 1000-1250 and affects 66% of the data points.

    Ref. :cite:t:`minku2010`

    Returns:
        pd.DataFrame: A dataframe containing the Circle dataset, with columns
        ``"var1"``, ``"var2"`` and ``"y"``.
    """
    # The CSV is resolved relative to this module, not the working directory.
    module_dir = os.path.dirname(__file__)
    csv_file = os.path.join(module_dir, "dataCircleGSev3Sp3Train.csv")

    # Only the first three columns of the file are read, under these names.
    column_names = ["var1", "var2", "y"]
    return pd.read_csv(csv_file, usecols=[0, 1, 2], names=column_names)
def fetch_rainfall_data():
    """Load the Rainfall dataset that ships in the datasets directory.

    National Oceanic and Atmospheric Administration (NOAA) rainfall data
    contains weather measurements collected over a 50 year period at a site
    location in Bellevue, Nebraska. It contains eight features: temperature,
    dew point, sea-level pressure, visibility, average wind speed, max
    sustained wind-speed, minimum temperature, and maximum temperature. The
    dependent variable is rain. Concept and data drift starts in index 12,000
    and persists through the rest of the dataset.

    Ref. :cite:t:`souza2020`

    Returns:
        pd.DataFrame: A dataframe containing the Rainfall dataset, with every
        column converted to a numeric dtype.
    """
    # Names for all ten columns of the file; the leading "index" column is
    # named here but excluded by ``usecols`` below.
    column_names = [
        "index",
        "temperature",
        "dew_point",
        "sea_level_pressure",
        "visibility",
        "average_wind_speed",
        "max_sustained_wind_speed",
        "minimum_temperature",
        "maximum_temperature",
        "rain",
    ]

    # The CSV is resolved relative to this module, not the working directory.
    csv_file = os.path.join(os.path.dirname(__file__), "rainfall_data.csv")
    raw = pd.read_csv(csv_file, usecols=list(range(1, 10)), names=column_names)

    # Drop the first parsed row (presumably the file's own header line, which
    # is read as data because explicit names are supplied -- verify against
    # the CSV), renumber the rows from zero, then coerce each column to a
    # numeric dtype.
    without_first_row = raw.iloc[1:, :].reset_index(drop=True)
    return without_first_row.apply(pd.to_numeric)