outliers

DecompositionAndOutlierGenerator

Bases: OutliersGenerator

Source code in badgers/generators/tabular_data/outliers.py
class DecompositionAndOutlierGenerator(OutliersGenerator):

    def __init__(self, decomposition_transformer: sklearn.base.TransformerMixin = PCA(n_components=2),
                 outlier_generator: OutliersGenerator = ZScoreSamplingGenerator(default_rng(0))):
        """

        :param decomposition_transformer: The dimensionality reduction transformer to be used before the outlier generator
        :param outlier_generator: The outlier generator to be used after the dimensionality has been reduced
        """
        assert hasattr(
            decomposition_transformer,
            'inverse_transform'), \
            f'the decomposition transformer class must implement the inverse_transform function.' \
            f'\nUnfortunately the class {decomposition_transformer} does not'
        super().__init__(random_generator=outlier_generator.random_generator)

        self.decomposition_transformer = decomposition_transformer
        self.outlier_generator = outlier_generator

    @preprocess_inputs
    def generate(self, X, y=None, **params):
        """
        Randomly generate outliers by first applying a dimensionality reduction technique (sklearn.decomposition)
        and an outlier transformer.

        1. Standardize the input data (mean = 0, variance = 1)
        2. Apply the dimensionality reduction transformer
        3. Generates outliers by applying the outlier transformer
        4. Inverse the dimensionality reduction and the standardization transformations

        :param X: the input features
        :param y: the regression target, class labels, or None
        :param params:
        :return:
        """

        # standardize the data and apply the dimensionality reduction transformer
        pipeline = make_pipeline(
            StandardScaler(),
            self.decomposition_transformer,
        )
        Xt = pipeline.fit_transform(X)
        # add outliers using the configured outlier generator
        Xt, yt = self.outlier_generator.generate(Xt, y, **params)
        # invert the dimensionality reduction and standardization transformations
        return pipeline.inverse_transform(Xt), yt

__init__(decomposition_transformer=PCA(n_components=2), outlier_generator=ZScoreSamplingGenerator(default_rng(0)))

:param decomposition_transformer: The dimensionality reduction transformer to be used before the outlier generator
:param outlier_generator: The outlier generator to be used after the dimensionality has been reduced

Source code in badgers/generators/tabular_data/outliers.py
def __init__(self, decomposition_transformer: sklearn.base.TransformerMixin = PCA(n_components=2),
             outlier_generator: OutliersGenerator = ZScoreSamplingGenerator(default_rng(0))):
    """

    :param decomposition_transformer: The dimensionality reduction transformer to be used before the outlier generator
    :param outlier_generator: The outlier generator to be used after the dimensionality has been reduced
    """
    assert hasattr(
        decomposition_transformer,
        'inverse_transform'), \
        f'the decomposition transformer class must implement the inverse_transform function.' \
        f'\nUnfortunately the class {decomposition_transformer} does not'
    super().__init__(random_generator=outlier_generator.random_generator)

    self.decomposition_transformer = decomposition_transformer
    self.outlier_generator = outlier_generator

generate(X, y=None, **params)

Randomly generate outliers by applying a dimensionality reduction technique (sklearn.decomposition) followed by an outlier generator.

  1. Standardize the input data (mean = 0, variance = 1)
  2. Apply the dimensionality reduction transformer
  3. Generate outliers by applying the outlier generator
  4. Invert the dimensionality reduction and the standardization transformations

:param X: the input features
:param y: the regression target, class labels, or None
:param params: additional parameters forwarded to the outlier generator's generate method
:return: the generated outliers (in the original feature space) and their labels

Source code in badgers/generators/tabular_data/outliers.py
@preprocess_inputs
def generate(self, X, y=None, **params):
    """
    Randomly generate outliers by first applying a dimensionality reduction technique (sklearn.decomposition)
    and an outlier transformer.

    1. Standardize the input data (mean = 0, variance = 1)
    2. Apply the dimensionality reduction transformer
    3. Generates outliers by applying the outlier transformer
    4. Inverse the dimensionality reduction and the standardization transformations

    :param X: the input features
    :param y: the regression target, class labels, or None
    :param params:
    :return:
    """

    # standardize the data and apply the dimensionality reduction transformer
    pipeline = make_pipeline(
        StandardScaler(),
        self.decomposition_transformer,
    )
    Xt = pipeline.fit_transform(X)
    # add outliers using the configured outlier generator
    Xt, yt = self.outlier_generator.generate(Xt, y, **params)
    # invert the dimensionality reduction and standardization transformations
    return pipeline.inverse_transform(Xt), yt
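
A minimal usage sketch (assuming the module path shown in the source header above; the toy DataFrame is only for illustration):

import pandas as pd
from numpy.random import default_rng
from sklearn.decomposition import PCA
from badgers.generators.tabular_data.outliers import (
    DecompositionAndOutlierGenerator, ZScoreSamplingGenerator)

# toy data: 100 rows, 4 numeric features
rng = default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 4)), columns=['a', 'b', 'c', 'd'])

generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=PCA(n_components=2),
    outlier_generator=ZScoreSamplingGenerator(default_rng(0)),
)
# extra keyword arguments are forwarded to the outlier generator
outliers, labels = generator.generate(X, n_outliers=5)
print(outliers.shape)  # (5, 4): outliers mapped back to the original feature space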

HistogramSamplingGenerator

Bases: OutliersGenerator

Randomly generates outliers from low density regions. Low density regions are estimated through a histogram.


WARNING: This computes a full d-dimensional histogram (d = number of features / columns), whose size grows exponentially with d (bins^d cells). It should only be used with low-dimensional data! It will raise an error if the number of dimensions is greater than 5.


TODO: this works but is very inefficient, better strategies are welcome!

Source code in badgers/generators/tabular_data/outliers.py
class HistogramSamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers from low density regions.
    Low density regions are estimated through a histogram.

    -----------------------------------------
    WARNING:
    This computes a full d-dimensional histogram (d = number of features / columns),
    whose size grows exponentially with d (bins**d cells).
    It should only be used with low-dimensional data!
    It will raise an error if the number of dimensions is greater than 5.
    -----------------------------------------

    TODO: this works but is very inefficient, better strategies are welcome!
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """

        :param random_generator: A random generator

        """

        super().__init__(random_generator)


    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10,
                 threshold_low_density: float = 0.1, bins: int = 10):
        """
        Randomly generates outliers from low density regions. Low density regions are estimated through a histogram.

        1. Standardize the input data (mean = 0, variance = 1)
        2. Compute and normalize the histogram of the data
        3. Sample data points uniformly at random within bins of low density
        4. Invert the standardization transformation

        :param X: the input features
        :param y: not used
        :param n_outliers: the number of outliers to generate
        :param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)
        :param bins: the number of bins for the histogram
        :return: the generated outliers and their labels
        """
        assert 0 < threshold_low_density < 1
        if X.shape[1] > 5:
            raise NotImplementedError('So far this generator only supports tabular data with at most 5 columns')
        # standardize X
        scaler = StandardScaler()
        # fit, transform
        scaler.fit(X)
        Xt = scaler.transform(X)

        # compute the histogram of the data
        hist, edges = np.histogramdd(Xt, density=False, bins=bins)
        # normalize the bin counts by their range
        norm_hist = hist / (np.max(hist) - np.min(hist))
        # get coordinates of the histogram where the density is low (below a certain threshold)
        hist_coords_low_density = np.where(norm_hist <= threshold_low_density)
        # randomly pick some coordinates in the histogram where the density is low
        hist_coords_random = self.random_generator.choice(list(zip(*hist_coords_low_density)), n_outliers,
                                                          replace=True)

        # computing outliers values
        outliers = np.array([
            [
                self.random_generator.uniform(low=edges[i][c], high=edges[i][c + 1])
                for i, c in enumerate(h_coords)
            ]
            for h_coords in hist_coords_random
        ])

        # in case we only have 1 outlier, reshape the array to match sklearn convention
        if outliers.shape[0] == 1:
            outliers = outliers.reshape(1, -1)

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt

__init__(random_generator=default_rng(seed=0))

:param random_generator: A random generator

Source code in badgers/generators/tabular_data/outliers.py
def __init__(self, random_generator=default_rng(seed=0)):
    """

    :param random_generator: A random generator

    """

    super().__init__(random_generator)

generate(X, y=None, n_outliers=10, threshold_low_density=0.1, bins=10)

Randomly generates outliers from low density regions. Low density regions are estimated through a histogram.

  1. Standardize the input data (mean = 0, variance = 1)
  2. Compute and normalize the histogram of the data
  3. Sample data points uniformly at random within bins of low density
  4. Invert the standardization transformation

:param X: the input features
:param y: not used
:param n_outliers: the number of outliers to generate
:param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)
:param bins: the number of bins for the histogram
:return: the generated outliers and their labels

Source code in badgers/generators/tabular_data/outliers.py
@preprocess_inputs
def generate(self, X, y=None, n_outliers: int = 10,
             threshold_low_density: float = 0.1, bins: int = 10):
    """
    Randomly generates outliers from low density regions. Low density regions are estimated through a histogram.

    1. Standardize the input data (mean = 0, variance = 1)
    2. Compute and normalize the histogram of the data
    3. Sample data points uniformly at random within bins of low density
    4. Invert the standardization transformation

    :param X: the input features
    :param y: not used
    :param n_outliers: the number of outliers to generate
    :param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)
    :param bins: the number of bins for the histogram
    :return: the generated outliers and their labels
    """
    assert 0 < threshold_low_density < 1
    if X.shape[1] > 5:
        raise NotImplementedError('So far this generator only supports tabular data with at most 5 columns')
    # standardize X
    scaler = StandardScaler()
    # fit, transform
    scaler.fit(X)
    Xt = scaler.transform(X)

    # compute the histogram of the data
    hist, edges = np.histogramdd(Xt, density=False, bins=bins)
    # normalize the bin counts by their range
    norm_hist = hist / (np.max(hist) - np.min(hist))
    # get coordinates of the histogram where the density is low (below a certain threshold)
    hist_coords_low_density = np.where(norm_hist <= threshold_low_density)
    # randomly pick some coordinates in the histogram where the density is low
    hist_coords_random = self.random_generator.choice(list(zip(*hist_coords_low_density)), n_outliers,
                                                      replace=True)

    # computing outliers values
    outliers = np.array([
        [
            self.random_generator.uniform(low=edges[i][c], high=edges[i][c + 1])
            for i, c in enumerate(h_coords)
        ]
        for h_coords in hist_coords_random
    ])

    # in case we only have 1 outlier, reshape the array to match sklearn convention
    if outliers.shape[0] == 1:
        outliers = outliers.reshape(1, -1)

    # add "outliers" as labels for outliers
    yt = np.array(["outliers"] * len(outliers))

    return scaler.inverse_transform(outliers), yt
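
A minimal usage sketch (assuming the module path shown in the source header above; toy data, and at most 5 columns as noted in the warning):

import pandas as pd
from numpy.random import default_rng
from badgers.generators.tabular_data.outliers import HistogramSamplingGenerator

rng = default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 3)), columns=['x1', 'x2', 'x3'])

generator = HistogramSamplingGenerator(default_rng(0))
# sample 20 points from histogram bins whose normalized count is <= 0.1
outliers, labels = generator.generate(X, n_outliers=20, threshold_low_density=0.1, bins=10)
print(outliers.shape)  # (20, 3)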

HypersphereSamplingGenerator

Bases: OutliersGenerator

Generates outliers by sampling points from a hypersphere with radius at least 3 sigma

Source code in badgers/generators/tabular_data/outliers.py
class HypersphereSamplingGenerator(OutliersGenerator):
    """
    Generates outliers by sampling points from a hypersphere with radius at least 3 sigma
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """

        :param random_generator: A random generator

        """
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10):
        """
        Randomly generates outliers by sampling points from a hypersphere of radius at least 3 sigma.

        1. Standardize the input data (mean = 0, variance = 1)
        2. Generate outliers on a hypersphere (see https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates):
            - angles are chosen uniformly at random
            - radius = 3 + a random number drawn from an exponential distribution with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
        3. Invert the standardization transformation

        :param X: the input features
        :param y: not used
        :param n_outliers: the number of outliers to generate
        :return: the generated outliers and their labels
        """

        # standardize X
        scaler = StandardScaler()

        # fit, transform
        scaler.fit(X)
        Xt = scaler.transform(X)

        # computing outliers
        outliers = np.array([
            random_spherical_coordinate(
                random_generator=self.random_generator,
                size=Xt.shape[1],
                radius=3. + self.random_generator.exponential()
            )
            for _ in range(n_outliers)
        ])

        # in case we only have 1 outlier, reshape the array to match sklearn convention
        if outliers.shape[0] == 1:
            outliers = outliers.reshape(1, -1)

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt

__init__(random_generator=default_rng(seed=0))

:param random_generator: A random generator

Source code in badgers/generators/tabular_data/outliers.py
def __init__(self, random_generator=default_rng(seed=0)):
    """

    :param random_generator: A random generator

    """
    super().__init__(random_generator)

generate(X, y=None, n_outliers=10)

Randomly generates outliers by sampling points from a hypersphere of radius at least 3 sigma.

  1. Standardize the input data (mean = 0, variance = 1)
  2. Generate outliers on a hypersphere (see https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates):
    • angles are chosen uniformly at random
    • radius = 3 + a random number drawn from an exponential distribution with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
  3. Invert the standardization transformation

:param X: the input features
:param y: not used
:param n_outliers: the number of outliers to generate
:return: the generated outliers and their labels

Source code in badgers/generators/tabular_data/outliers.py
@preprocess_inputs
def generate(self, X, y=None, n_outliers: int = 10):
    """
    Randomly generates outliers by sampling points from a hypersphere of radius at least 3 sigma.

    1. Standardize the input data (mean = 0, variance = 1)
    2. Generate outliers on a hypersphere (see https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates):
        - angles are chosen uniformly at random
        - radius = 3 + a random number drawn from an exponential distribution with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
    3. Invert the standardization transformation

    :param X: the input features
    :param y: not used
    :param n_outliers: the number of outliers to generate
    :return: the generated outliers and their labels
    """

    # standardize X
    scaler = StandardScaler()

    # fit, transform
    scaler.fit(X)
    Xt = scaler.transform(X)

    # computing outliers
    outliers = np.array([
        random_spherical_coordinate(
            random_generator=self.random_generator,
            size=Xt.shape[1],
            radius=3. + self.random_generator.exponential()
        )
        for _ in range(n_outliers)
    ])

    # in case we only have 1 outlier, reshape the array to match sklearn convention
    if outliers.shape[0] == 1:
        outliers = outliers.reshape(1, -1)

    # add "outliers" as labels for outliers
    yt = np.array(["outliers"] * len(outliers))

    return scaler.inverse_transform(outliers), yt
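
A minimal usage sketch (assuming the module path shown in the source header above; toy data for illustration):

import pandas as pd
from numpy.random import default_rng
from badgers.generators.tabular_data.outliers import HypersphereSamplingGenerator

rng = default_rng(0)
X = pd.DataFrame(rng.normal(size=(150, 4)), columns=['a', 'b', 'c', 'd'])

generator = HypersphereSamplingGenerator(default_rng(0))
outliers, labels = generator.generate(X, n_outliers=8)
print(outliers.shape)  # (8, 4): points at radius >= 3 in standardized space, mapped back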

IndependentHistogramsGenerator

Bases: OutliersGenerator

Randomly generates outliers from low density regions. Low density regions are estimated through several independent histograms (one for each feature).

For each feature (column), a histogram is computed (it approximates the marginal distribution). Values are generated from bins with a low number of data points.

All values generated for each feature are simply concatenated (independence hypothesis!).

Source code in badgers/generators/tabular_data/outliers.py
class IndependentHistogramsGenerator(OutliersGenerator):
    """
    Randomly generates outliers from low density regions.
    Low density regions are estimated through several independent histograms (one for each feature).

    For each feature (column), a histogram is computed (it approximates the marginal distribution).
    Values are generated from bins with a low number of data points.

    All values generated for each feature are simply concatenated (independence hypothesis!).
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        super().__init__(random_generator=random_generator)

    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, bins: int = 10):
        """
        Randomly generates outliers from low density regions.
        Low density regions are estimated through several independent histograms (one for each feature).

        For each feature (column), a histogram is computed (it approximates the marginal distribution).
        Values are generated from bins with a low number of data points.

        All values generated for each feature are simply concatenated (independence hypothesis!).

        :param X: the input features
        :param y: not used
        :param n_outliers: the number of outliers to generate
        :param bins: the number of bins for each histogram
        :return: the generated outliers and their labels
        """
        outliers = []

        # loop over all features (columns)
        for col in range(X.shape[1]):
            # compute histogram of the current feature
            hist, bin_edges = np.histogram(X.iloc[:, col], bins=bins)
            # compute inverse density
            inv_density = 1 - hist / np.max(hist)
            # the sampling probability is proportional to the inverse density
            p = inv_density / np.sum(inv_density)
            # generate values:
            # first, choose randomly from which bin the value must be sampled
            indices = self.random_generator.choice(bins, p=p, size=n_outliers, replace=True)
            # second, sample uniformly at random from the selected bin
            values = [self.random_generator.uniform(low=bin_edges[i], high=bin_edges[i + 1]) for i in indices]
            # append the values for the current feature
            outliers.append(values)
        # cast as a numpy array
        outliers = np.array(outliers).T

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return outliers, yt

generate(X, y=None, n_outliers=10, bins=10)

Randomly generates outliers from low density regions. Low density regions are estimated through several independent histograms (one for each feature).

For each feature (column), a histogram is computed (it approximates the marginal distribution). Values are generated from bins with a low number of data points.

All values generated for each feature are simply concatenated (independence hypothesis!).

:param X: the input features
:param y: not used
:param n_outliers: the number of outliers to generate
:param bins: the number of bins for each histogram
:return: the generated outliers and their labels

Source code in badgers/generators/tabular_data/outliers.py
@preprocess_inputs
def generate(self, X, y=None, n_outliers: int = 10, bins: int = 10):
    """
    Randomly generates outliers from low density regions.
    Low density regions are estimated through several independent histograms (one for each feature).

    For each feature (column), a histogram is computed (it approximates the marginal distribution).
    Values are generated from bins with a low number of data points.

    All values generated for each feature are simply concatenated (independence hypothesis!).

    :param X: the input features
    :param y: not used
    :param n_outliers: the number of outliers to generate
    :param bins: the number of bins for each histogram
    :return: the generated outliers and their labels
    """
    outliers = []

    # loop over all features (columns)
    for col in range(X.shape[1]):
        # compute histogram of the current feature
        hist, bin_edges = np.histogram(X.iloc[:, col], bins=bins)
        # compute inverse density
        inv_density = 1 - hist / np.max(hist)
        # the sampling probability is proportional to the inverse density
        p = inv_density / np.sum(inv_density)
        # generate values:
        # first, choose randomly from which bin the value must be sampled
        indices = self.random_generator.choice(bins, p=p, size=n_outliers, replace=True)
        # second, sample uniformly at random from the selected bin
        values = [self.random_generator.uniform(low=bin_edges[i], high=bin_edges[i + 1]) for i in indices]
        # append the values for the current feature
        outliers.append(values)
    # cast as a numpy array
    outliers = np.array(outliers).T

    # add "outliers" as labels for outliers
    yt = np.array(["outliers"] * len(outliers))

    return outliers, yt
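
A minimal usage sketch (assuming the module path shown in the source header above; toy data for illustration). Note that the outliers are returned directly in the original feature space, with no standardization round-trip:

import pandas as pd
from numpy.random import default_rng
from badgers.generators.tabular_data.outliers import IndependentHistogramsGenerator

rng = default_rng(0)
X = pd.DataFrame(rng.normal(size=(300, 4)), columns=['a', 'b', 'c', 'd'])

generator = IndependentHistogramsGenerator(default_rng(0))
# each feature value is sampled from a sparsely populated bin of that feature's histogram
outliers, labels = generator.generate(X, n_outliers=15, bins=10)
print(outliers.shape)  # (15, 4)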

LowDensitySamplingGenerator

Bases: OutliersGenerator

Randomly generates outliers from low density regions. Low density regions are estimated using a KernelDensity estimator. Points are sampled uniformly at random and filtered out if they do not belong to a low density region.

TODO: this works but might not be efficient, a better sampling strategy is welcome

Source code in badgers/generators/tabular_data/outliers.py
class LowDensitySamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers from low density regions.
    Low density regions are estimated using a KernelDensity estimator.
    Points are sampled uniformly at random and filtered out if they do not belong to a low density region

    TODO: this works but might not be efficient, a better sampling strategy is welcome
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """

        :param random_generator: A random generator
        """
        super().__init__(random_generator=random_generator)
        self.density_estimator = KernelDensity(bandwidth="scott")


    @preprocess_inputs
    def generate(self, X, y=None, n_outliers: int = 10, threshold_low_density: float = 0.1, max_samples: int = 100):
        """
        Generate data points belonging to low density regions.

        Pseudo code:
        - Standardize the data X
        - Estimate the density based upon the original data X
        - Compute a threshold for determining low density (the threshold_low_density quantile of the density scores)
        - Sample uniformly at random within the hypercube [min, max]
        - Estimate the density of the new points and filter out the ones with a density that is above the threshold

        :param X: the input features
        :param y: not used
        :param n_outliers: The number of outliers to generate
        :param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)
        :param max_samples: the maximum number of candidate points to sample (if None, defaults to n_outliers * 100)
        :return: the generated outliers and their labels
        """
        assert 0 < threshold_low_density < 1
        # standardize X
        scaler = StandardScaler()
        # fit, transform
        scaler.fit(X)
        Xt = scaler.transform(X)
        # fit density estimator
        self.density_estimator = self.density_estimator.fit(Xt)
        low_density_threshold = np.percentile(self.density_estimator.score_samples(Xt),
                                              threshold_low_density * 100)  # np.percentile expects a percentage in [0, 100]

        if max_samples is None:
            max_samples = n_outliers * 100

        outliers = np.array([
            x
            for x in self.random_generator.uniform(
                low=np.min(Xt, axis=0),
                high=np.max(Xt, axis=0),
                size=(max_samples, Xt.shape[1])
            )
            if self.density_estimator.score_samples(x.reshape(1, -1)) <= low_density_threshold
        ])

        if outliers.shape[0] < n_outliers:
            warnings.warn(
                f'LowDensitySamplingGenerator could not generate all {n_outliers} outliers. It only generated {len(outliers)}.')
        else:
            outliers = outliers[:n_outliers]

        # in case we only have 1 outlier, reshape the array to match sklearn convention
        if outliers.shape[0] == 1:
            outliers = outliers.reshape(1, -1)

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        # in the case no outliers could be generated
        if outliers.shape[0] == 0:
            return outliers, yt

        return scaler.inverse_transform(outliers), yt

__init__(random_generator=default_rng(seed=0))

:param random_generator: A random generator

Source code in badgers/generators/tabular_data/outliers.py
def __init__(self, random_generator=default_rng(seed=0)):
    """

    :param random_generator: A random generator
    """
    super().__init__(random_generator=random_generator)
    self.density_estimator = KernelDensity(bandwidth="scott")

generate(X, y=None, n_outliers=10, threshold_low_density=0.1, max_samples=100)

Generate data points belonging to low density regions.

Pseudo code:

  - Standardize the data X
  - Estimate the density based upon the original data X
  - Compute a threshold for determining low density (the threshold_low_density quantile of the density scores)
  - Sample uniformly at random within the hypercube [min, max]
  - Estimate the density of the new points and filter out the ones with a density above the threshold

:param X: the input features
:param y: not used
:param n_outliers: the number of outliers to generate
:param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)
:param max_samples: the maximum number of candidate points to sample (if None, defaults to n_outliers * 100)
:return: the generated outliers and their labels

Source code in badgers/generators/tabular_data/outliers.py
@preprocess_inputs
def generate(self, X, y=None, n_outliers: int = 10, threshold_low_density: float = 0.1, max_samples: int = 100):
    """
    Generate data points belonging to low density regions.

    Pseudo code:
    - Standardize the data X
    - Estimate the density based upon the original data X
    - Compute a threshold for determining low density (the threshold_low_density quantile of the density scores)
    - Sample uniformly at random within the hypercube [min, max]
    - Estimate the density of the new points and filter out the ones with a density that is above the threshold

    :param X: the input features
    :param y: not used
    :param n_outliers: The number of outliers to generate
    :param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)
    :param max_samples: the maximum number of candidate points to sample (if None, defaults to n_outliers * 100)
    :return: the generated outliers and their labels
    """
    assert 0 < threshold_low_density < 1
    # standardize X
    scaler = StandardScaler()
    # fit, transform
    scaler.fit(X)
    Xt = scaler.transform(X)
    # fit density estimator
    self.density_estimator = self.density_estimator.fit(Xt)
    low_density_threshold = np.percentile(self.density_estimator.score_samples(Xt),
                                          threshold_low_density * 100)  # np.percentile expects a percentage in [0, 100]

    if max_samples is None:
        max_samples = n_outliers * 100

    outliers = np.array([
        x
        for x in self.random_generator.uniform(
            low=np.min(Xt, axis=0),
            high=np.max(Xt, axis=0),
            size=(max_samples, Xt.shape[1])
        )
        if self.density_estimator.score_samples(x.reshape(1, -1)) <= low_density_threshold
    ])

    if outliers.shape[0] < n_outliers:
        warnings.warn(
            f'LowDensitySamplingGenerator could not generate all {n_outliers} outliers. It only generated {len(outliers)}.')
    else:
        outliers = outliers[:n_outliers]

    # in case we only have 1 outlier, reshape the array to match sklearn convention
    if outliers.shape[0] == 1:
        outliers = outliers.reshape(1, -1)

    # add "outliers" as labels for outliers
    yt = np.array(["outliers"] * len(outliers))

    # in the case no outliers could be generated
    if outliers.shape[0] == 0:
        return outliers, yt

    return scaler.inverse_transform(outliers), yt
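
A minimal usage sketch (assuming the module path shown in the source header above; toy data for illustration). Fewer than n_outliers points may be returned, with a warning, if too few candidates fall in low density regions:

import pandas as pd
from numpy.random import default_rng
from badgers.generators.tabular_data.outliers import LowDensitySamplingGenerator

rng = default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 2)), columns=['x1', 'x2'])

generator = LowDensitySamplingGenerator(default_rng(0))
outliers, labels = generator.generate(X, n_outliers=10, threshold_low_density=0.1, max_samples=1000)
print(len(outliers))  # at most 10; candidates scoring above the density threshold are discarded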

OutliersGenerator

Bases: GeneratorMixin

Base class for transformers that add outliers to tabular data

Source code in badgers/generators/tabular_data/outliers.py
class OutliersGenerator(GeneratorMixin):
    """
    Base class for transformers that add outliers to tabular data
    """

    def __init__(self, random_generator: np.random.Generator = default_rng(seed=0)):
        """
        :param random_generator: A random generator
        """
        self.random_generator = random_generator

    @abc.abstractmethod
    def generate(self, X, y=None, **params):
        pass

__init__(random_generator=default_rng(seed=0))

:param random_generator: A random generator

Source code in badgers/generators/tabular_data/outliers.py
def __init__(self, random_generator: np.random.Generator = default_rng(seed=0)):
    """
    :param random_generator: A random generator
    """
    self.random_generator = random_generator
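
To add a new kind of outlier generator, subclass OutliersGenerator and implement generate. A minimal sketch (ConstantShiftGenerator is a hypothetical name for illustration; the built-in subclasses additionally apply the @preprocess_inputs decorator):

import numpy as np
from numpy.random import default_rng
from badgers.generators.tabular_data.outliers import OutliersGenerator

class ConstantShiftGenerator(OutliersGenerator):
    """Hypothetical example: copy random rows of X and shift them away from the bulk."""

    def generate(self, X, y=None, n_outliers: int = 10, shift: float = 5.0):
        # pick n_outliers rows at random (with replacement) and add a constant offset
        rows = self.random_generator.choice(len(X), size=n_outliers, replace=True)
        outliers = np.asarray(X)[rows] + shift
        yt = np.array(["outliers"] * n_outliers)
        return outliers, yt

generator = ConstantShiftGenerator(default_rng(42))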

ZScoreSamplingGenerator

Bases: OutliersGenerator

Randomly generates outliers as data points with a z-score > 3.

Source code in badgers/generators/tabular_data/outliers.py
class ZScoreSamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers as data points with a z-score > 3.
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """

        :param random_generator: A random generator
        """
        super().__init__(random_generator)

    @preprocess_inputs
    def generate(self, X, y, n_outliers: int = 10):
        """
        Randomly generates outliers as data points with a z-score > 3.

        1. Standardize the input data (mean = 0, variance = 1)
        2. Generate outliers as follows:
            - the sign is randomly chosen
            - for each dimension: the value is equal to 3 + a random number drawn from an exponential distribution
            with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
        3. Invert the standardization transformation

        :param X: the input features
        :param y: the class labels, target values, or None
        :param n_outliers: the number of outliers to generate
        :return: the generated outliers and their labels
        """

        # standardize X
        scaler = StandardScaler()

        # fit, transform
        scaler.fit(X)
        Xt = scaler.transform(X)

        # generate outliers
        outliers = np.array([
            random_sign(self.random_generator, size=Xt.shape[1]) * (
                3. + self.random_generator.exponential(size=Xt.shape[1]))
            for _ in range(n_outliers)
        ])

        # in case we only have 1 outlier, reshape the array to match sklearn convention
        if outliers.shape[0] == 1:
            outliers = outliers.reshape(1, -1)

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt

__init__(random_generator=default_rng(seed=0))

:param random_generator: A random generator

Source code in badgers/generators/tabular_data/outliers.py
def __init__(self, random_generator=default_rng(seed=0)):
    """

    :param random_generator: A random generator
    """
    super().__init__(random_generator)

generate(X, y, n_outliers=10)

Randomly generates outliers as data points with a z-score > 3.

  1. Standardize the input data (mean = 0, variance = 1)
  2. Generate outliers as follows:
    • the sign is randomly chosen
    • for each dimension: the value is equal to 3 + a random number drawn from an exponential distribution with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
  3. Invert the standardization transformation

:param X: the input features
:param y: the class labels, target values, or None
:param n_outliers: the number of outliers to generate
:return: the generated outliers and their labels

Source code in badgers/generators/tabular_data/outliers.py
@preprocess_inputs
def generate(self, X, y, n_outliers: int = 10):
    """
    Randomly generates outliers as data points with a z-score > 3.

    1. Standardize the input data (mean = 0, variance = 1)
    2. Generate outliers as follows:
        - the sign is randomly chosen
        - for each dimension: the value is equal to 3 + a random number drawn from an exponential distribution
        with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
    3. Invert the standardization transformation

    :param X: the input features
    :param y: the class labels, target values, or None
    :param n_outliers: the number of outliers to generate
    :return: the generated outliers and their labels
    """

    # standardize X
    scaler = StandardScaler()

    # fit, transform
    scaler.fit(X)
    Xt = scaler.transform(X)

    # generate outliers
    outliers = np.array([
        random_sign(self.random_generator, size=Xt.shape[1]) * (
            3. + self.random_generator.exponential(size=Xt.shape[1]))
        for _ in range(n_outliers)
    ])

    # in case we only have 1 outlier, reshape the array to match sklearn convention
    if outliers.shape[0] == 1:
        outliers = outliers.reshape(1, -1)

    # add "outliers" as labels for outliers
    yt = np.array(["outliers"] * len(outliers))

    return scaler.inverse_transform(outliers), yt
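
A minimal usage sketch (assuming the module path shown in the source header above; toy data for illustration). Note that y is a positional argument here:

import pandas as pd
from numpy.random import default_rng
from badgers.generators.tabular_data.outliers import ZScoreSamplingGenerator

rng = default_rng(0)
X = pd.DataFrame(rng.normal(loc=10.0, scale=2.0, size=(100, 3)), columns=['x1', 'x2', 'x3'])

generator = ZScoreSamplingGenerator(default_rng(0))
outliers, labels = generator.generate(X, None, n_outliers=10)
print(outliers.shape)  # (10, 3): every coordinate has |z-score| > 3 before inverse scaling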