Skip to content

missingness

DummyMissingAtRandom

Bases: MissingValueGenerator

A generator that removes values at random (MAR [1]), where the probability of a data instance X[_,i] missing depends upon another feature X[_,j], where j is randomly chosen.

See also [1] https://stefvanbuuren.name/fimd/sec-MCAR.html

Source code in badgers/generators/tabular_data/missingness.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class DummyMissingAtRandom(MissingValueGenerator):
    """
    A generator that removes values at random (MAR [1]),
    where the probability of a data instance X[_,i] missing depends upon another feature X[_,j],
    where j is randomly chosen.

    See also [1] https://stefvanbuuren.name/fimd/sec-MCAR.html
    """

    def __init__(self, percentage_missing: int = 10, random_generator=default_rng(seed=0)):
        """

        :param percentage_missing: The percentage of missing values (int value between 0 and 100 included)
        :param random_generator: A random generator
        """
        # NOTE: the default generator is built once, when this signature is evaluated,
        # so all instances created without an explicit generator share the same object.
        super().__init__(percentage_missing=percentage_missing, random_generator=random_generator)

    @numpy_API
    def generate(self, X, y, **params):
        """
        Replace a fixed number of values per column with NaN, where the rows of column i
        are drawn with weights computed from another, randomly chosen column j.

        The weight of a row is (max(X[:,j]) - X[row,j]) / (max(X[:,j]) - min(X[:,j])),
        passed through ``normalize_proba``. With a single column, the column drives itself.
        The selected (row, col) pairs are stored in ``self.missing_values_indices_``.

        :param X: the input features (assumes a 2-D numpy array is handed over by ``numpy_API`` -- TODO confirm)
        :param y: the target (returned untouched)
        :return: Xt, yt
        """
        # Integer and boolean arrays cannot store NaN (and would truncate the weights),
        # so they are promoted to float. Other dtypes are left as they are, which means
        # that X is modified in place exactly as before.
        if X.dtype.kind in "biu":
            X = X.astype(float)

        n_rows, n_cols = X.shape

        # scale every column to [0, 1]; the column maximum gets 0, the column minimum gets 1
        col_max = X.max(axis=0)
        col_range = col_max - X.min(axis=0)
        # a constant column would give 0 / 0 = NaN weights: fall back to equal weights
        constant = col_range == 0
        X_norm = (col_max - X) / np.where(constant, 1, col_range)
        X_norm[:, constant] = 1

        # column i depends on ONE other column j, drawn at random
        if n_cols > 1:
            p = np.empty_like(X_norm)
            for i in range(n_cols):
                j = self.random_generator.choice([c for c in range(n_cols) if c != i])
                p[:, i] = X_norm[:, j]
        else:
            p = X_norm
        # presumably rescales each column so that it sums to 1 (helper not visible here)
        p = normalize_proba(p)

        # compute number of missing values per column
        nb_missing = int(n_rows * self.percentage_missing / 100)
        # generate missing values indices
        self.missing_values_indices_ = []
        for col in range(n_cols):
            rows = self.random_generator.choice(n_rows, size=nb_missing, replace=False, p=p[:, col])
            self.missing_values_indices_.extend((row, col) for row in rows)
            # generate missing values
            X[rows, col] = np.nan

        return X, y

__init__(percentage_missing=10, random_generator=default_rng(seed=0))

:param percentage_missing: The percentage of missing values (int value between 0 and 100 included) :param random_generator: A random generator

Source code in badgers/generators/tabular_data/missingness.py
80
81
82
83
84
85
86
def __init__(self, percentage_missing: int = 10, random_generator=default_rng(seed=0)):
    """
    Set up the generator by forwarding both settings to the parent initializer.

    :param percentage_missing: The percentage of missing values (int value between 0 and 100 included)
    :param random_generator: A random generator
    """
    super().__init__(
        percentage_missing=percentage_missing,
        random_generator=random_generator,
    )

generate(X, y, **params)

:param X: the input features :param y: the target :return: Xt, yt

Source code in badgers/generators/tabular_data/missingness.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
@numpy_API
def generate(self, X, y, **params):
    """
    Replace a fixed number of values per column with NaN, where the rows of column i
    are drawn with weights computed from another, randomly chosen column j.

    The weight of a row is (max(X[:,j]) - X[row,j]) / (max(X[:,j]) - min(X[:,j])),
    passed through ``normalize_proba``. With a single column, the column drives itself.
    The selected (row, col) pairs are stored in ``self.missing_values_indices_``.

    :param X: the input features (assumes a 2-D numpy array is handed over by ``numpy_API`` -- TODO confirm)
    :param y: the target (returned untouched)
    :return: Xt, yt
    """
    # Integer and boolean arrays cannot store NaN (and would truncate the weights),
    # so they are promoted to float. Other dtypes are left as they are, which means
    # that X is modified in place exactly as before.
    if X.dtype.kind in "biu":
        X = X.astype(float)

    n_rows, n_cols = X.shape

    # scale every column to [0, 1]; the column maximum gets 0, the column minimum gets 1
    col_max = X.max(axis=0)
    col_range = col_max - X.min(axis=0)
    # a constant column would give 0 / 0 = NaN weights: fall back to equal weights
    constant = col_range == 0
    X_norm = (col_max - X) / np.where(constant, 1, col_range)
    X_norm[:, constant] = 1

    # column i depends on ONE other column j, drawn at random
    if n_cols > 1:
        p = np.empty_like(X_norm)
        for i in range(n_cols):
            j = self.random_generator.choice([c for c in range(n_cols) if c != i])
            p[:, i] = X_norm[:, j]
    else:
        p = X_norm
    # presumably rescales each column so that it sums to 1 (helper not visible here)
    p = normalize_proba(p)

    # compute number of missing values per column
    nb_missing = int(n_rows * self.percentage_missing / 100)
    # generate missing values indices
    self.missing_values_indices_ = []
    for col in range(n_cols):
        rows = self.random_generator.choice(n_rows, size=nb_missing, replace=False, p=p[:, col])
        self.missing_values_indices_.extend((row, col) for row in rows)
        # generate missing values
        X[rows, col] = np.nan

    return X, y

DummyMissingNotAtRandom

Bases: MissingValueGenerator

A generator that removes values not at random (MNAR [1]), where the probability of a data instance X[i,j] missing depends linearly upon its own value. A data point X[i,j] = max(X[:,j]) has a missing probability of 1. A data point X[i,j] = min(X[:,j]) has a missing probability of 0.

Source code in badgers/generators/tabular_data/missingness.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
class DummyMissingNotAtRandom(MissingValueGenerator):
    """
    A generator that removes values not at random (MNAR [1]),
    where the probability of a data instance X[i,j] missing depends linearly upon its own value.

    The sampling weight of X[i,j] is (max(X[:,j]) - X[i,j]) / (max(X[:,j]) - min(X[:,j])),
    passed through ``normalize_proba``: a data point equal to the column maximum gets weight 0
    and a data point equal to the column minimum gets the largest weight.

    NOTE(review): this text previously stated the opposite direction (maximum -> probability 1,
    minimum -> probability 0). The code is left as it was; confirm which direction is intended.

    See also [1] https://stefvanbuuren.name/fimd/sec-MCAR.html
    """

    def __init__(self, percentage_missing: int = 10, random_generator=default_rng(seed=0)):
        """

        :param percentage_missing: The percentage of missing values (int value between 0 and 100 included)
        :param random_generator: A random generator
        """
        # NOTE: the default generator is built once, when this signature is evaluated,
        # so all instances created without an explicit generator share the same object.
        super().__init__(percentage_missing=percentage_missing, random_generator=random_generator)

    @numpy_API
    def generate(self, X, y, **params):
        """
        Replace a fixed number of values per column with NaN, drawing the rows of each
        column with weights computed from the values of that same column.
        The selected (row, col) pairs are stored in ``self.missing_values_indices_``.

        :param X: the input features (assumes a 2-D numpy array is handed over by ``numpy_API`` -- TODO confirm)
        :param y: the target (returned untouched)
        :return: Xt, yt
        """
        # Integer and boolean arrays cannot store NaN, so they are promoted to float.
        # Other dtypes are left as they are, which means that X is modified in place
        # exactly as before.
        if X.dtype.kind in "biu":
            X = X.astype(float)

        n_rows, n_cols = X.shape

        # scale every column to [0, 1]; the column maximum gets 0, the column minimum gets 1
        col_max = X.max(axis=0)
        col_range = col_max - X.min(axis=0)
        # a constant column would give 0 / 0 = NaN weights: fall back to equal weights
        constant = col_range == 0
        p = (col_max - X) / np.where(constant, 1, col_range)
        p[:, constant] = 1
        # make the sum of each column = 1
        p = normalize_proba(p)

        # compute number of missing values per column
        nb_missing = int(n_rows * self.percentage_missing / 100)
        # generate missing values indices
        self.missing_values_indices_ = []
        for col in range(n_cols):
            rows = self.random_generator.choice(n_rows, size=nb_missing, replace=False, p=p[:, col])
            self.missing_values_indices_.extend((row, col) for row in rows)
            # generate missing values
            X[rows, col] = np.nan

        return X, y

__init__(percentage_missing=10, random_generator=default_rng(seed=0))

:param percentage_missing: The percentage of missing values (int value between 0 and 100 included) :param random_generator: A random generator

Source code in badgers/generators/tabular_data/missingness.py
130
131
132
133
134
135
136
def __init__(self, percentage_missing: int = 10, random_generator=default_rng(seed=0)):
    """
    Set up the generator by forwarding both settings to the parent initializer.

    :param percentage_missing: The percentage of missing values (int value between 0 and 100 included)
    :param random_generator: A random generator
    """
    super().__init__(
        percentage_missing=percentage_missing,
        random_generator=random_generator,
    )

generate(X, y, **params)

:param X: the input features :param y: the target :return: Xt, yt

Source code in badgers/generators/tabular_data/missingness.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
@numpy_API
def generate(self, X, y, **params):
    """
    Replace a fixed number of values per column with NaN, drawing the rows of each
    column with weights computed from the values of that same column.

    The weight of X[i,j] is (max(X[:,j]) - X[i,j]) / (max(X[:,j]) - min(X[:,j])),
    passed through ``normalize_proba``: the column maximum gets weight 0 and the
    column minimum gets the largest weight.
    The selected (row, col) pairs are stored in ``self.missing_values_indices_``.

    :param X: the input features (assumes a 2-D numpy array is handed over by ``numpy_API`` -- TODO confirm)
    :param y: the target (returned untouched)
    :return: Xt, yt
    """
    # Integer and boolean arrays cannot store NaN, so they are promoted to float.
    # Other dtypes are left as they are, which means that X is modified in place
    # exactly as before.
    if X.dtype.kind in "biu":
        X = X.astype(float)

    n_rows, n_cols = X.shape

    # scale every column to [0, 1]; the column maximum gets 0, the column minimum gets 1
    col_max = X.max(axis=0)
    col_range = col_max - X.min(axis=0)
    # a constant column would give 0 / 0 = NaN weights: fall back to equal weights
    constant = col_range == 0
    p = (col_max - X) / np.where(constant, 1, col_range)
    p[:, constant] = 1
    # make the sum of each column = 1
    p = normalize_proba(p)

    # compute number of missing values per column
    nb_missing = int(n_rows * self.percentage_missing / 100)
    # generate missing values indices
    self.missing_values_indices_ = []
    for col in range(n_cols):
        rows = self.random_generator.choice(n_rows, size=nb_missing, replace=False, p=p[:, col])
        self.missing_values_indices_.extend((row, col) for row in rows)
        # generate missing values
        X[rows, col] = np.nan

    return X, y

MissingCompletelyAtRandom

Bases: MissingValueGenerator

A generator that removes values completely at random (MCAR [1]) (uniform distribution over all data).

See also [1] https://stefvanbuuren.name/fimd/sec-MCAR.html

Source code in badgers/generators/tabular_data/missingness.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
class MissingCompletelyAtRandom(MissingValueGenerator):
    """
    A generator that removes values completely at random (MCAR [1]) (uniform distribution over all data).

    See also [1] https://stefvanbuuren.name/fimd/sec-MCAR.html
    """

    def __init__(self, percentage_missing: int = 10, random_generator=default_rng(seed=0)):
        """

        :param percentage_missing: The percentage of missing values (int value between 0 and 100 included)
        :param random_generator: A random generator
        """
        # NOTE: the default generator is built once, when this signature is evaluated,
        # so all instances created without an explicit generator share the same object.
        super().__init__(percentage_missing=percentage_missing, random_generator=random_generator)

    @numpy_API
    def generate(self, X, y, **params):
        """
        Computes indices of missing values using a uniform distribution.

        For every column the same number of distinct rows is drawn and set to NaN.
        The selected (row, col) pairs are stored in ``self.missing_values_indices_``.

        :param X: the input features (assumes a 2-D numpy array is handed over by ``numpy_API`` -- TODO confirm)
        :param y: the target (returned untouched)
        :return: Xt, yt
        """
        # Integer arrays raise when NaN is assigned and boolean arrays silently store True,
        # so both are promoted to float. Other dtypes (float, object, ...) are left as they
        # are, which means that X is modified in place exactly as before.
        if X.dtype.kind in "biu":
            X = X.astype(float)

        n_rows, n_cols = X.shape

        # compute number of missing values per column
        nb_missing = int(n_rows * self.percentage_missing / 100)
        # generate missing values indices
        self.missing_values_indices_ = []
        for col in range(n_cols):
            # p=None: every row is equally likely
            rows = self.random_generator.choice(n_rows, size=nb_missing, replace=False, p=None)
            self.missing_values_indices_.extend((row, col) for row in rows)
            # generate missing values
            X[rows, col] = np.nan

        return X, y

__init__(percentage_missing=10, random_generator=default_rng(seed=0))

:param percentage_missing: The percentage of missing values (int value between 0 and 100 included) :param random_generator: A random generator

Source code in badgers/generators/tabular_data/missingness.py
41
42
43
44
45
46
47
def __init__(self, percentage_missing: int = 10, random_generator=default_rng(seed=0)):
    """
    Set up the generator by forwarding both settings to the parent initializer.

    :param percentage_missing: The percentage of missing values (int value between 0 and 100 included)
    :param random_generator: A random generator
    """
    super().__init__(
        percentage_missing=percentage_missing,
        random_generator=random_generator,
    )

generate(X, y, **params)

Computes indices of missing values using a uniform distribution.

:param X: the input features :param y: the target :return: Xt, yt

Source code in badgers/generators/tabular_data/missingness.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
@numpy_API
def generate(self, X, y, **params):
    """
    Computes indices of missing values using a uniform distribution.

    For every column the same number of distinct rows is drawn and set to NaN.
    The selected (row, col) pairs are stored in ``self.missing_values_indices_``.

    :param X: the input features (assumes a 2-D numpy array is handed over by ``numpy_API`` -- TODO confirm)
    :param y: the target (returned untouched)
    :return: Xt, yt
    """
    # Integer arrays raise when NaN is assigned and boolean arrays silently store True,
    # so both are promoted to float. Other dtypes (float, object, ...) are left as they
    # are, which means that X is modified in place exactly as before.
    if X.dtype.kind in "biu":
        X = X.astype(float)

    n_rows, n_cols = X.shape

    # compute number of missing values per column
    nb_missing = int(n_rows * self.percentage_missing / 100)
    # generate missing values indices
    self.missing_values_indices_ = []
    for col in range(n_cols):
        # p=None: every row is equally likely
        rows = self.random_generator.choice(n_rows, size=nb_missing, replace=False, p=None)
        self.missing_values_indices_.extend((row, col) for row in rows)
        # generate missing values
        X[rows, col] = np.nan

    return X, y

MissingValueGenerator

Bases: GeneratorMixin

Base class for missing-value generators

Source code in badgers/generators/tabular_data/missingness.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
class MissingValueGenerator(GeneratorMixin):
    """
    Common parent of the generators that inject missing values.

    It stores the requested percentage and the random generator, and leaves
    ``generate`` to the subclasses.
    """

    def __init__(self, percentage_missing: int = 10, random_generator: numpy.random.Generator = default_rng(seed=0)):
        """
        Validate and store the settings shared by all missing-value generators.

        :param percentage_missing: The percentage of missing values (int value between 0 and 100 included)
        :param random_generator: A random generator
        """
        # reject percentages outside [0, 100] before anything is stored
        assert 0 <= percentage_missing <= 100
        # nothing has been generated yet
        self.missing_values_indices_ = None
        self.random_generator = random_generator
        self.percentage_missing = percentage_missing

    @abc.abstractmethod
    def generate(self, X, y, **params):
        """
        To be implemented by subclasses.

        :param X: the input features
        :param y: the target
        """

__init__(percentage_missing=10, random_generator=default_rng(seed=0))

:param percentage_missing: The percentage of missing values (int value between 0 and 100 included) :param random_generator: A random generator

Source code in badgers/generators/tabular_data/missingness.py
18
19
20
21
22
23
24
25
26
27
def __init__(self, percentage_missing: int = 10, random_generator: numpy.random.Generator = default_rng(seed=0)):
    """
    Validate and store the settings shared by all missing-value generators.

    :param percentage_missing: The percentage of missing values (int value between 0 and 100 included)
    :param random_generator: A random generator
    """
    # reject percentages outside [0, 100] before anything is stored
    assert 0 <= percentage_missing <= 100
    # nothing has been generated yet
    self.missing_values_indices_ = None
    self.random_generator = random_generator
    self.percentage_missing = percentage_missing