Skip to content

drift

DriftGenerator

Bases: GeneratorMixin

Base class for generators that apply drift to tabular data

Source code in badgers/generators/tabular_data/drift.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
class DriftGenerator(GeneratorMixin):
    """
    Base class for generators that apply drift to tabular data
    """

    def __init__(self, random_generator=None):
        """
        :param random_generator: numpy.random.Generator, default None
            A random generator. When None, a fresh default_rng(seed=0) is created
            for this instance.
        """
        # A Generator built in the signature is evaluated once, at definition time,
        # and would then be shared (and advanced) by every instance relying on the
        # default. Building it here keeps each instance independently reproducible.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        self.random_generator = random_generator

    @abc.abstractmethod
    def generate(self, X, y, **params):
        """
        Apply drift to the data. Must be implemented by subclasses.

        :param X: the input features
        :param y: the class labels / targets
        :param params: generator specific parameters
        """
        pass

__init__(random_generator=default_rng(seed=0))

:param random_generator: numpy.random.Generator, default default_rng(seed=0) A random generator

Source code in badgers/generators/tabular_data/drift.py
18
19
20
21
22
23
def __init__(self, random_generator=None):
    """
    :param random_generator: numpy.random.Generator, default None
        A random generator. When None, a fresh default_rng(seed=0) is created
        for this instance.
    """
    # A Generator built in the signature is evaluated once, at definition time,
    # and would then be shared (and advanced) by every instance relying on the
    # default. Building it here keeps each instance independently reproducible.
    if random_generator is None:
        random_generator = default_rng(seed=0)
    self.random_generator = random_generator

RandomShiftClassesGenerator

Bases: DriftGenerator

Randomly shift (geometrical translation) values of each class independently of one another. Data are first standardized (mean = 0, var = 1) and for each class a random number is added to all instances.

Source code in badgers/generators/tabular_data/drift.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
class RandomShiftClassesGenerator(DriftGenerator):
    """
    Randomly shift (geometrical translation) values of each class independently of one another.
    Data are first standardized (mean = 0, var = 1) and
    for each class a random number is added to all instances.
    """

    def __init__(self, random_generator=None):
        """
        :param random_generator: A random generator (numpy.random.Generator).
            When None, a fresh default_rng(seed=0) is created for this instance.
        """
        # Resolve the default here rather than in the signature: a Generator created
        # in the signature is built once and shared by every instance using the default.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator=random_generator)

    @preprocess_inputs
    def generate(self, X, y, shift_std: Union[float, np.ndarray] = 0.1):
        """
        Randomly shift (geometrical translation) values of each class independently of one another.
        Data are first standardized (mean = 0, var = 1) and
        for each class a random number is added to all instances.

        :param X: the input features; accessed through `.columns` and `.index`, so a pandas
            DataFrame is expected here (presumably ensured by `preprocess_inputs` - to be confirmed)
        :param y: the class labels, one per row of X
        :param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution).
            Either a float, or an array that broadcasts to the number of classes.
        :return: a tuple (Xt, y) where Xt is a DataFrame with the same columns and index as X
            holding the shifted values (in the original scale) and y is returned unchanged
        """
        # extract unique labels
        classes = np.unique(y)
        # standardize X so that the shift is expressed in units of standard deviation
        scaler = StandardScaler()
        Xt = scaler.fit_transform(X)
        # draw one shift per class; it is applied to every column of that class
        shifts = self.random_generator.normal(loc=0, scale=shift_std, size=len(classes))
        for c, s in zip(classes, shifts):
            Xt[y == c] += s
        # map the shifted data back to the original scale
        return pd.DataFrame(data=scaler.inverse_transform(Xt), columns=X.columns, index=X.index), y

__init__(random_generator=default_rng(seed=0))

:param random_generator: A random generator

Source code in badgers/generators/tabular_data/drift.py
78
79
80
81
82
def __init__(self, random_generator=None):
    """
    :param random_generator: A random generator (numpy.random.Generator).
        When None, a fresh default_rng(seed=0) is created for this instance.
    """
    # Resolve the default here rather than in the signature: a Generator created
    # in the signature is built once and shared by every instance using the default.
    if random_generator is None:
        random_generator = default_rng(seed=0)
    super().__init__(random_generator=random_generator)

generate(X, y, shift_std=0.1)

Randomly shift (geometrical translation) values of each class independently of one another. Data are first standardized (mean = 0, var = 1) and for each class a random number is added to all instances.

:param X: :param y: :param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution)

Source code in badgers/generators/tabular_data/drift.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
@preprocess_inputs
def generate(self, X, y, shift_std: Union[float, np.ndarray] = 0.1):
    """
    Randomly shift (geometrical translation) values of each class independently of one another.
    Data are first standardized (mean = 0, var = 1) and
    for each class a random number is added to all instances.

    :param X: the input features; accessed through `.columns` and `.index`, so a pandas
        DataFrame is expected here (presumably ensured by `preprocess_inputs` - to be confirmed)
    :param y: the class labels, one per row of X
    :param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution).
        Either a float, or an array that broadcasts to the number of classes.
    :return: a tuple (Xt, y) where Xt is a DataFrame with the same columns and index as X
        holding the shifted values (in the original scale) and y is returned unchanged
    """
    # extract unique labels
    classes = np.unique(y)
    # standardize X so that the shift is expressed in units of standard deviation
    scaler = StandardScaler()
    Xt = scaler.fit_transform(X)
    # draw one shift per class; it is applied to every column of that class
    shifts = self.random_generator.normal(loc=0, scale=shift_std, size=len(classes))
    for c, s in zip(classes, shifts):
        Xt[y == c] += s
    # map the shifted data back to the original scale
    return pd.DataFrame(data=scaler.inverse_transform(Xt), columns=X.columns, index=X.index), y

RandomShiftGenerator

Bases: DriftGenerator

Randomly shift (geometrical translation) values of each column independently of one another. Data are first standardized (mean = 0, var = 1) and a random number is added to each column. The i-th column is simply translated: $x_i \leftarrow x_i + \epsilon_i$

Source code in badgers/generators/tabular_data/drift.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
class RandomShiftGenerator(DriftGenerator):
    r"""
    Randomly shift (geometrical translation) values of each column independently of one another.
    Data are first standardized (mean = 0, var = 1) and a random number is added to each column.
    The i-th column is simply translated: `$x_i \leftarrow x_i + \epsilon_i$`
    """

    def __init__(self, random_generator=None):
        """
        :param random_generator: A random generator (numpy.random.Generator).
            When None, a fresh default_rng(seed=0) is created for this instance.
        """
        # Resolve the default here rather than in the signature: a Generator created
        # in the signature is built once and shared by every instance using the default.
        if random_generator is None:
            random_generator = default_rng(seed=0)
        super().__init__(random_generator=random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, shift_std: Union[float, np.ndarray] = 0.1):
        r"""
        Randomly shift (geometrical translation) values of each column independently of one another.
        Data are first standardized (mean = 0, var = 1) and a random number is added to each column.
        The i-th column is simply translated: `$x_i \leftarrow x_i + \epsilon_i$`

        :param X: the input features; accessed through `.shape`, `.columns` and `.index`, so a pandas
            DataFrame is expected here (presumably ensured by `preprocess_inputs` - to be confirmed)
        :param y: not used by this generator, returned as is
        :param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution).
            Either a float, or an array that broadcasts to the number of columns.
        :return: a tuple (Xt, y) where Xt is a DataFrame with the same columns and index as X
            holding the shifted values (in the original scale) and y is returned unchanged
        """
        # standardize X so that the shift is expressed in units of standard deviation
        scaler = StandardScaler()
        Xt = scaler.fit_transform(X)
        # draw one shift per column
        shift = self.random_generator.normal(loc=0, scale=shift_std, size=X.shape[1])
        # the 1-d shift broadcasts over the rows: every row of a column gets the same offset
        Xt += shift
        # map the shifted data back to the original scale
        return pd.DataFrame(data=scaler.inverse_transform(Xt), columns=X.columns, index=X.index), y

__init__(random_generator=default_rng(seed=0))

:param random_generator: A random generator (note: shift_std, the standard deviation of the normally distributed shift, is a parameter of generate(), not of this constructor)

Source code in badgers/generators/tabular_data/drift.py
38
39
40
41
42
43
44
def __init__(self, random_generator=None):
    """
    :param random_generator: A random generator (numpy.random.Generator).
        When None, a fresh default_rng(seed=0) is created for this instance.
    """
    # Resolve the default here rather than in the signature: a Generator created
    # in the signature is built once and shared by every instance using the default.
    if random_generator is None:
        random_generator = default_rng(seed=0)
    super().__init__(random_generator=random_generator)

generate(X, y=None, shift_std=0.1)

Randomly shift (geometrical translation) values of each column independently of one another. Data are first standardized (mean = 0, var = 1) and a random number is added to each column. The i-th column is simply translated: $x_i \leftarrow x_i + \epsilon_i$

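:param X: the input features :param y: not used by this generator, returned unchanged :param shift_std: the standard deviation of the amount of shift applied (shift is chosen from a normal distribution) :return: a tuple (Xt, y) with the shifted data Xt and the unchanged y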

Source code in badgers/generators/tabular_data/drift.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
@preprocess_inputs
def generate(self, X, y=None, shift_std: Union[float, np.ndarray] = 0.1):
    r"""
    Randomly shift (geometrical translation) values of each column independently of one another.
    Data are first standardized (mean = 0, var = 1) and a random number is added to each column.
    The i-th column is simply translated: `$x_i \leftarrow x_i + \epsilon_i$`

    :param X: the input features; accessed through `.shape`, `.columns` and `.index`, so a pandas
        DataFrame is expected here (presumably ensured by `preprocess_inputs` - to be confirmed)
    :param y: not used by this generator, returned as is
    :param shift_std: The standard deviation of the amount of shift applied (shift is chosen from a normal distribution).
        Either a float, or an array that broadcasts to the number of columns.
    :return: a tuple (Xt, y) where Xt is a DataFrame with the same columns and index as X
        holding the shifted values (in the original scale) and y is returned unchanged
    """
    # standardize X so that the shift is expressed in units of standard deviation
    scaler = StandardScaler()
    Xt = scaler.fit_transform(X)
    # draw one shift per column
    shift = self.random_generator.normal(loc=0, scale=shift_std, size=X.shape[1])
    # the 1-d shift broadcasts over the rows: every row of a column gets the same offset
    Xt += shift
    # map the shifted data back to the original scale
    return pd.DataFrame(data=scaler.inverse_transform(Xt), columns=X.columns, index=X.index), y