Skip to content

outliers

DecompositionAndOutlierGenerator

Bases: OutliersGenerator

Source code in badgers/generators/tabular_data/outliers.py
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
class DecompositionAndOutlierGenerator(OutliersGenerator):
    """
    Generates outliers in a reduced space (obtained with a decomposition transformer)
    and maps them back to the original feature space.
    """

    def __init__(self, decomposition_transformer: sklearn.base.TransformerMixin = PCA(n_components=2),
                 outlier_generator: OutliersGenerator = ZScoreSamplingGenerator(default_rng(0),
                                                                                n_outliers=10)):
        """
        NOTE: the two default objects are created once, when this method is defined, and are
        therefore shared by all instances that rely on the defaults.

        :param decomposition_transformer: The dimensionality reduction transformer applied before the outlier
            generator; it must provide an `inverse_transform` method
        :param outlier_generator: The outlier generator applied once the dimensionality has been reduced;
            its `random_generator` and `n_outliers` are reused to initialise this generator
        """
        is_invertible = hasattr(decomposition_transformer, 'inverse_transform')
        assert is_invertible, (
            'the decomposition transformer class must implement the inverse_transform function.'
            f'\nUnfortunately the class {decomposition_transformer} does not'
        )
        super().__init__(
            random_generator=outlier_generator.random_generator,
            n_outliers=outlier_generator.n_outliers,
        )

        self.decomposition_transformer = decomposition_transformer
        self.outlier_generator = outlier_generator

    def generate(self, X, y=None, **params):
        """
        Generate outliers in the reduced space and project them back to the input space.

        1. Standardize the input data (mean = 0, variance = 1)
        2. Apply the dimensionality reduction transformer
        3. Generate outliers by applying the wrapped outlier generator
        4. Inverse the dimensionality reduction and the standardization transformations

        NOTE(review): `params` is accepted but not forwarded to the wrapped outlier generator;
        confirm whether this is intended.

        :param X: the input features
        :param y: the regression target, class labels, or None (passed to the wrapped outlier generator)
        :param params: not used
        :return: the outliers expressed in the original feature space, and the labels returned by the
            wrapped outlier generator
        """
        # standardization followed by the decomposition, fitted on X
        reduction = make_pipeline(StandardScaler(), self.decomposition_transformer)
        reduced = reduction.fit_transform(X)
        # the wrapped generator works on the reduced representation
        reduced_outliers, yt = self.outlier_generator.generate(reduced, y)
        # undo the decomposition and the standardization
        restored = reduction.inverse_transform(reduced_outliers)
        return restored, yt

__init__(decomposition_transformer=PCA(n_components=2), outlier_generator=ZScoreSamplingGenerator(default_rng(0), n_outliers=10))

:param decomposition_transformer: The dimensionality reduction transformer to be used before the outlier transformer

:param outlier_generator: The outlier transformer to be used after the dimensionality has been reduced

Source code in badgers/generators/tabular_data/outliers.py
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
def __init__(self, decomposition_transformer: sklearn.base.TransformerMixin = PCA(n_components=2),
             outlier_generator: OutliersGenerator = ZScoreSamplingGenerator(default_rng(0),
                                                                            n_outliers=10)):
    """
    NOTE: the two default objects are created once, when this method is defined, and are
    therefore shared by all instances that rely on the defaults.

    :param decomposition_transformer: The dimensionality reduction transformer applied before the outlier
        generator; it must provide an `inverse_transform` method
    :param outlier_generator: The outlier generator applied once the dimensionality has been reduced;
        its `random_generator` and `n_outliers` are reused to initialise this generator
    """
    is_invertible = hasattr(decomposition_transformer, 'inverse_transform')
    assert is_invertible, (
        'the decomposition transformer class must implement the inverse_transform function.'
        f'\nUnfortunately the class {decomposition_transformer} does not'
    )
    super().__init__(
        random_generator=outlier_generator.random_generator,
        n_outliers=outlier_generator.n_outliers,
    )

    self.decomposition_transformer = decomposition_transformer
    self.outlier_generator = outlier_generator

generate(X, y=None, **params)

Randomly generate outliers by first applying a dimensionality reduction technique (sklearn.decomposition) and an outlier transformer.

  1. Standardize the input data (mean = 0, variance = 1)
  2. Apply the dimensionality reduction transformer
  3. Generates outliers by applying the outlier transformer
  4. Inverse the dimensionality reduction and the standardization transformations

:param X: the input features

:param y: the regression target, class labels, or None

:param params:

:return:

Source code in badgers/generators/tabular_data/outliers.py
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
def generate(self, X, y=None, **params):
    """
    Generate outliers in the reduced space and project them back to the input space.

    1. Standardize the input data (mean = 0, variance = 1)
    2. Apply the dimensionality reduction transformer
    3. Generate outliers by applying the wrapped outlier generator
    4. Inverse the dimensionality reduction and the standardization transformations

    NOTE(review): `params` is accepted but not forwarded to the wrapped outlier generator;
    confirm whether this is intended.

    :param X: the input features
    :param y: the regression target, class labels, or None (passed to the wrapped outlier generator)
    :param params: not used
    :return: the outliers expressed in the original feature space, and the labels returned by the
        wrapped outlier generator
    """
    # standardization followed by the decomposition, fitted on X
    reduction = make_pipeline(StandardScaler(), self.decomposition_transformer)
    reduced = reduction.fit_transform(X)
    # the wrapped generator works on the reduced representation
    reduced_outliers, yt = self.outlier_generator.generate(reduced, y)
    # undo the decomposition and the standardization
    restored = reduction.inverse_transform(reduced_outliers)
    return restored, yt

HistogramSamplingGenerator

Bases: OutliersGenerator

Randomly generates outliers from low density regions. Low density regions are estimated through a histogram.


WARNING: This computes a full histogram in d dimensions (d = nb features / columns). The histogram has bins^d cells, so its cost grows exponentially with d (not O(d²)). Should only be used with low dimensionality data! It will raise an error if the number of dimensions is greater than 5.


TODO: this works but is very inefficient, better strategies are welcome!

Source code in badgers/generators/tabular_data/outliers.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
class HistogramSamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers from low density regions.
    Low density regions are estimated through a histogram.

    -----------------------------------------
    WARNING:
    This computes a full histogram in d dimensions (d = nb features / columns), i.e. bins**d cells:
    the cost grows exponentially with d.
    Should only be used with low dimensionality data!
    It will raise an error if the number of dimensions is greater than 5.
    -----------------------------------------

    TODO: this works but is very inefficient, better strategies are welcome!
    """

    def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10,
                 threshold_low_density: float = 0.1, bins: int = 10):
        """

        :param random_generator: A random generator
        :param n_outliers: The number of outliers to generate
        :param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)
        :param bins: number of bins for the histogram
        """
        assert 0 < threshold_low_density < 1
        super().__init__(random_generator, n_outliers)
        self.threshold_low_density = threshold_low_density
        self.bins = bins

    def generate(self, X, y=None, **params):
        """
        Randomly generates outliers from low density regions. Low density regions are estimated through histograms

        1. Standardize the input data (mean = 0, variance = 1)
        2. Compute and normalize histogram for the data
        3. Sample datapoint uniformly at random within bins of low density
        4. Inverse the standardization transformation

        :param X: the input features
        :param y: not used
        :param params: not used
        :return: the generated outliers (in the original feature space) and an array of "outliers" labels
        :raises NotImplementedError: if X has more than 5 columns
        :raises ValueError: if no low density bin can be identified
        """
        if X.shape[1] > 5:
            raise NotImplementedError('So far this generator only supports tabular data with at most 5 columns')
        # standardize X
        scaler = StandardScaler()
        scaler.fit(X)
        Xt = scaler.transform(X)

        # compute the histogram of the data (raw counts)
        hist, edges = np.histogramdd(Xt, density=False, bins=self.bins)
        # normalize the counts by their range
        count_range = np.max(hist) - np.min(hist)
        if count_range == 0:
            # every bin holds the same count: dividing would yield inf/nan and no bin would qualify
            raise ValueError('All histogram bins contain the same number of data points: '
                             'no low density region can be identified')
        norm_hist = hist / count_range
        # coordinates (one row per bin) of the histogram where the density is low (below the threshold)
        low_density_coords = np.argwhere(norm_hist <= self.threshold_low_density)
        if len(low_density_coords) == 0:
            raise ValueError(f'No histogram bin has a normalized density below {self.threshold_low_density}: '
                             f'no low density region can be identified')
        # randomly pick (with replacement) some of the low density bins
        picked_coords = self.random_generator.choice(low_density_coords, self.n_outliers, replace=True)

        # for each picked bin, sample one value per dimension uniformly within the bin edges
        outliers = np.array([
            [
                self.random_generator.uniform(low=edges[dim][idx], high=edges[dim][idx + 1])
                for dim, idx in enumerate(bin_coords)
            ]
            for bin_coords in picked_coords
        ])

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return scaler.inverse_transform(outliers), yt

__init__(random_generator=default_rng(seed=0), n_outliers=10, threshold_low_density=0.1, bins=10)

:param random_generator: A random generator

:param n_outliers: The number of outliers to generate

:param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)

:param bins: number of bins for the histogram

Source code in badgers/generators/tabular_data/outliers.py
219
220
221
222
223
224
225
226
227
228
229
230
231
def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10,
             threshold_low_density: float = 0.1, bins: int = 10):
    """
    NOTE: the default random generator is created once, when this method is defined, and is
    therefore shared by all instances that rely on the default.

    :param random_generator: A random generator
    :param n_outliers: The number of outliers to generate
    :param threshold_low_density: the threshold that defines a low density region
        (must be strictly between 0 and 1)
    :param bins: number of bins for the histogram
    """
    # the threshold is compared against normalized bin counts
    assert 0 < threshold_low_density < 1
    super().__init__(random_generator, n_outliers)
    self.threshold_low_density, self.bins = threshold_low_density, bins

generate(X, y=None, **params)

Randomly generates outliers from low density regions. Low density regions are estimated through histograms

  1. Standardize the input data (mean = 0, variance = 1)
  2. Compute and normalize histogram for the data
  3. Sample datapoint uniformly at random within bins of low density
  4. Inverse the standardization transformation

:param X: the input features

:param y: not used

:param params:

:return:

Source code in badgers/generators/tabular_data/outliers.py
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
def generate(self, X, y=None, **params):
    """
    Randomly generates outliers from low density regions. Low density regions are estimated through histograms

    1. Standardize the input data (mean = 0, variance = 1)
    2. Compute and normalize histogram for the data
    3. Sample datapoint uniformly at random within bins of low density
    4. Inverse the standardization transformation

    :param X: the input features
    :param y: not used
    :param params: not used
    :return: the generated outliers (in the original feature space) and an array of "outliers" labels
    :raises NotImplementedError: if X has more than 5 columns
    :raises ValueError: if no low density bin can be identified
    """
    if X.shape[1] > 5:
        raise NotImplementedError('So far this generator only supports tabular data with at most 5 columns')
    # standardize X
    scaler = StandardScaler()
    scaler.fit(X)
    Xt = scaler.transform(X)

    # compute the histogram of the data (raw counts)
    hist, edges = np.histogramdd(Xt, density=False, bins=self.bins)
    # normalize the counts by their range
    count_range = np.max(hist) - np.min(hist)
    if count_range == 0:
        # every bin holds the same count: dividing would yield inf/nan and no bin would qualify
        raise ValueError('All histogram bins contain the same number of data points: '
                         'no low density region can be identified')
    norm_hist = hist / count_range
    # coordinates (one row per bin) of the histogram where the density is low (below the threshold)
    low_density_coords = np.argwhere(norm_hist <= self.threshold_low_density)
    if len(low_density_coords) == 0:
        raise ValueError(f'No histogram bin has a normalized density below {self.threshold_low_density}: '
                         f'no low density region can be identified')
    # randomly pick (with replacement) some of the low density bins
    picked_coords = self.random_generator.choice(low_density_coords, self.n_outliers, replace=True)

    # for each picked bin, sample one value per dimension uniformly within the bin edges
    outliers = np.array([
        [
            self.random_generator.uniform(low=edges[dim][idx], high=edges[dim][idx + 1])
            for dim, idx in enumerate(bin_coords)
        ]
        for bin_coords in picked_coords
    ])

    # add "outliers" as labels for outliers
    yt = np.array(["outliers"] * len(outliers))

    return scaler.inverse_transform(outliers), yt

HypersphereSamplingGenerator

Bases: OutliersGenerator

Generates outliers by sampling points from a hypersphere with radius at least 3 sigma

Source code in badgers/generators/tabular_data/outliers.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class HypersphereSamplingGenerator(OutliersGenerator):
    """
    Generates outliers by sampling points from a hypersphere with radius at least 3 sigma
    """

    def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10):
        """
        NOTE: the default random generator is created once, when this method is defined, and is
        therefore shared by all instances that rely on the default.

        :param random_generator: A random generator
        :param n_outliers: The number of outliers to generate
        """
        super().__init__(random_generator=random_generator, n_outliers=n_outliers)

    def generate(self, X, y=None, **params):
        """
        Randomly generates outliers located on hyperspheres of radius greater than 3 (in standardized units).

        1. Standardize the input data (mean = 0, variance = 1)
        2. Generate outliers on a hypersphere (see https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates):
            - the point is drawn by `random_spherical_coordinate` for the given radius
            - radius is = 3 + a random number following an exponential distribution function with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
        3. Inverse the standardization transformation

        :param X: the input features
        :param y: not used
        :param params: not used
        :return: the generated outliers (in the original feature space) and an array of "outliers" labels
        """
        # standardize X; only the number of columns of the standardized data is needed below
        scaler = StandardScaler()
        Xt = scaler.fit(X).transform(X)
        n_features = Xt.shape[1]

        # draw the outliers one at a time: first the radius, then the point itself
        sampled = []
        for _ in range(self.n_outliers):
            radius = 3. + self.random_generator.exponential()
            sampled.append(
                random_spherical_coordinate(
                    random_generator=self.random_generator,
                    size=n_features,
                    radius=radius,
                )
            )
        outliers = np.array(sampled)

        # a single outlier is kept as a 2D array with one row (sklearn convention)
        if outliers.shape[0] == 1:
            outliers = outliers.reshape(1, -1)

        # every generated point is labelled "outliers"
        yt = np.array(["outliers"] * outliers.shape[0])

        return scaler.inverse_transform(outliers), yt

__init__(random_generator=default_rng(seed=0), n_outliers=10)

:param random_generator: A random generator

:param n_outliers: The number of outliers to generate

Source code in badgers/generators/tabular_data/outliers.py
 94
 95
 96
 97
 98
 99
100
def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10):
    """
    NOTE: the default random generator is created once, when this method is defined, and is
    therefore shared by all instances that rely on the default.

    :param random_generator: A random generator
    :param n_outliers: The number of outliers to generate
    """
    super().__init__(random_generator=random_generator, n_outliers=n_outliers)

generate(X, y=None, **params)

Randomly generates outliers as data points with a z-score > 3.

  1. Standardize the input data (mean = 0, variance = 1)
  2. Generate outliers on a hypersphere (see https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates):
    • angles are chosen uniformly at random
    • radius is = 3 + a random number following an exponential distribution function with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
  3. Inverse the standardization transformation

:param X: the input features

:param y: not used

:param params:

:return:

Source code in badgers/generators/tabular_data/outliers.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def generate(self, X, y=None, **params):
    """
    Randomly generates outliers located on hyperspheres of radius greater than 3 (in standardized units).

    1. Standardize the input data (mean = 0, variance = 1)
    2. Generate outliers on a hypersphere (see https://en.wikipedia.org/wiki/N-sphere#Spherical_coordinates):
        - the point is drawn by `random_spherical_coordinate` for the given radius
        - radius is = 3 + a random number following an exponential distribution function with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
    3. Inverse the standardization transformation

    :param X: the input features
    :param y: not used
    :param params: not used
    :return: the generated outliers (in the original feature space) and an array of "outliers" labels
    """
    # standardize X; only the number of columns of the standardized data is needed below
    scaler = StandardScaler()
    Xt = scaler.fit(X).transform(X)
    n_features = Xt.shape[1]

    # draw the outliers one at a time: first the radius, then the point itself
    sampled = []
    for _ in range(self.n_outliers):
        radius = 3. + self.random_generator.exponential()
        sampled.append(
            random_spherical_coordinate(
                random_generator=self.random_generator,
                size=n_features,
                radius=radius,
            )
        )
    outliers = np.array(sampled)

    # a single outlier is kept as a 2D array with one row (sklearn convention)
    if outliers.shape[0] == 1:
        outliers = outliers.reshape(1, -1)

    # every generated point is labelled "outliers"
    yt = np.array(["outliers"] * outliers.shape[0])

    return scaler.inverse_transform(outliers), yt

IndependentHistogramsGenerator

Bases: OutliersGenerator

Randomly generates outliers from low density regions. Low density regions are estimated through several independent histograms (one for each feature).

For each feature (column), a histogram is computed (it approximates the marginal distribution). Values are generated from bins with a low number of data points.

All values generated for each feature are simply concatenated (independence hypothesis!).

Source code in badgers/generators/tabular_data/outliers.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class IndependentHistogramsGenerator(OutliersGenerator):
    """
    Randomly generates outliers from low density regions.
    Low density regions are estimated through several independent histograms (one for each feature).

    For each feature (column), a histogram is computed (it approximates the marginal distribution).
    Values are generated from bins with a low number of data points.

    All values generated for each feature are simply concatenated (independence hypothesis!).
    """

    def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10, bins: int = 10):
        """

        :param random_generator: A random generator
        :param n_outliers: The number of outliers to generate
        :param bins: number of bins for each per-feature histogram
        """
        super().__init__(random_generator=random_generator, n_outliers=n_outliers)
        self.bins = bins

    @numpy_API
    def generate(self, X, y=None, **params):
        """
        Randomly generates outliers from low density regions.
        Low density regions are estimated through several independent histograms (one for each feature).

        For each feature (column), a histogram is computed (it approximates the marginal distribution).
        Values are generated from bins with a low number of data points.

        All values generated for each feature are simply concatenated (independence hypothesis!).

        If all bins of a feature hold the same number of data points (no bin is less dense than another),
        the bin is chosen uniformly at random for that feature.

        :param X: the input features
        :param y: not used
        :param params: not used
        :return: the generated outliers (one row per outlier) and an array of "outliers" labels
        """
        outliers = []

        # loop over all features (columns)
        for col in range(X.shape[1]):
            # compute histogram of the current feature
            hist, bin_edges = np.histogram(X[:, col], bins=self.bins)
            # compute inverse density (0 for the fullest bin, close to 1 for nearly empty bins)
            inv_density = 1 - hist / np.max(hist)
            # the sampling probability is proportional to the inverse density;
            # when the inverse density sums to 0 the normalisation would yield NaN, so fall back to uniform
            total = np.sum(inv_density)
            p = inv_density / total if total > 0 else None
            # generate values:
            # first, choose randomly from which bin the value must be sampled
            indices = self.random_generator.choice(len(hist), p=p, size=self.n_outliers, replace=True)
            # second, sample uniformly at random from the selected bin
            values = [self.random_generator.uniform(low=bin_edges[i], high=bin_edges[i + 1]) for i in indices]
            # append the values for the current feature
            outliers.append(values)
        # cast as a numpy array with one row per outlier and one column per feature
        outliers = np.array(outliers).T

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        return outliers, yt

generate(X, y=None, **params)

Randomly generates outliers from low density regions. Low density regions are estimated through several independent histograms (one for each feature).

For each feature (column), a histogram is computed (it approximates the marginal distribution). Values are generated from bins with a low number of data points.

All values generated for each feature are simply concatenated (independence hypothesis!).

:param X: the input features

:param y: not used

:param params:

:return:

Source code in badgers/generators/tabular_data/outliers.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
@numpy_API
def generate(self, X, y=None, **params):
    """
    Randomly generates outliers from low density regions.
    Low density regions are estimated through several independent histograms (one for each feature).

    For each feature (column), a histogram is computed (it approximates the marginal distribution).
    Values are generated from bins with a low number of data points.

    All values generated for each feature are simply concatenated (independence hypothesis!).

    If all bins of a feature hold the same number of data points (no bin is less dense than another),
    the bin is chosen uniformly at random for that feature.

    :param X: the input features
    :param y: not used
    :param params: not used
    :return: the generated outliers (one row per outlier) and an array of "outliers" labels
    """
    outliers = []

    # loop over all features (columns)
    for col in range(X.shape[1]):
        # compute histogram of the current feature
        hist, bin_edges = np.histogram(X[:, col], bins=self.bins)
        # compute inverse density (0 for the fullest bin, close to 1 for nearly empty bins)
        inv_density = 1 - hist / np.max(hist)
        # the sampling probability is proportional to the inverse density;
        # when the inverse density sums to 0 the normalisation would yield NaN, so fall back to uniform
        total = np.sum(inv_density)
        p = inv_density / total if total > 0 else None
        # generate values:
        # first, choose randomly from which bin the value must be sampled
        indices = self.random_generator.choice(len(hist), p=p, size=self.n_outliers, replace=True)
        # second, sample uniformly at random from the selected bin
        values = [self.random_generator.uniform(low=bin_edges[i], high=bin_edges[i + 1]) for i in indices]
        # append the values for the current feature
        outliers.append(values)
    # cast as a numpy array with one row per outlier and one column per feature
    outliers = np.array(outliers).T

    # add "outliers" as labels for outliers
    yt = np.array(["outliers"] * len(outliers))

    return outliers, yt

LowDensitySamplingGenerator

Bases: OutliersGenerator

Randomly generates outliers from low density regions. Low density regions are estimated using a KernelDensity estimator. Points are sampled uniformly at random and filtered out if they do not belong to a low density region

TODO: this works but might not be efficient, a better sampling strategy is welcome

Source code in badgers/generators/tabular_data/outliers.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
class LowDensitySamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers from low density regions.
    Low density regions are estimated using a KernelDensity estimator.
    Points are sampled uniformly at random and filtered out if they do not belong to a low density region

    TODO: this works but might not be efficient, a better sampling strategy is welcome
    """

    def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10, threshold_low_density: float = 0.1):
        """

        :param random_generator: A random generator
        :param n_outliers: The number of outliers to generate
        :param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1);
            it is the fraction of the original data points whose estimated density lies below the cut-off
        """
        super().__init__(random_generator=random_generator, n_outliers=n_outliers)
        self.density_estimator = KernelDensity(bandwidth="scott")
        self.threshold_low_density = threshold_low_density

    def generate(self, X, y=None, **params):
        """
        Generate data points belonging to low density regions.

        Pseudo code:
        - Standardize the data X
        - Estimate the density based upon the original data X
        - Compute a threshold for determining low density: the `threshold_low_density` quantile of the
          densities estimated for the original data (10th percentile with the default value 0.1)
        - Sample uniformly at random within the hypercube [min, max]
        - Estimate the density of the new points and filter out the ones with a density that is above the threshold

        A warning is emitted when fewer than `n_outliers` candidates pass the filter.

        :param X: the input features
        :param y: not used
        :param params: optional `max_samples`: the number of candidate points to draw
            (defaults to 100 * n_outliers)
        :return: the generated outliers (in the original feature space, at most `n_outliers` rows)
            and an array of "outliers" labels
        """
        # standardize X
        scaler = StandardScaler()
        scaler.fit(X)
        Xt = scaler.transform(X)
        # fit density estimator
        self.density_estimator = self.density_estimator.fit(Xt)
        # threshold_low_density is a fraction in [0, 1], hence np.quantile (np.percentile expects 0-100)
        low_density_threshold = np.quantile(self.density_estimator.score_samples(Xt), self.threshold_low_density)

        max_samples = params.get('max_samples')
        if max_samples is None:
            max_samples = self.n_outliers * 100

        # draw all the candidates at once within the bounding box of the standardized data
        candidates = self.random_generator.uniform(
            low=np.min(Xt, axis=0),
            high=np.max(Xt, axis=0),
            size=(max_samples, Xt.shape[1])
        )
        if candidates.shape[0] > 0:
            # score the candidates in a single call and keep the ones lying in low density regions
            scores = self.density_estimator.score_samples(candidates)
            outliers = candidates[scores <= low_density_threshold]
        else:
            # nothing to score
            outliers = candidates

        if outliers.shape[0] < self.n_outliers:
            warnings.warn(
                f'LowDensitySamplingGenerator could not generate all {self.n_outliers} outliers. It only generated {len(outliers)}.')
        else:
            outliers = outliers[:self.n_outliers]

        # add "outliers" as labels for outliers
        yt = np.array(["outliers"] * len(outliers))

        # in the case no outliers could be generated
        if outliers.shape[0] == 0:
            return outliers, yt

        return scaler.inverse_transform(outliers), yt

__init__(random_generator=default_rng(seed=0), n_outliers=10, threshold_low_density=0.1)

:param random_generator: A random generator

:param n_outliers: The number of outliers to generate

:param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)

Source code in badgers/generators/tabular_data/outliers.py
293
294
295
296
297
298
299
300
301
302
def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10, threshold_low_density: float = 0.1):
    """
    NOTE: the default random generator is created once, when this method is defined, and is
    therefore shared by all instances that rely on the default.

    :param random_generator: A random generator
    :param n_outliers: The number of outliers to generate
    :param threshold_low_density: the threshold that defines a low density region (must be between 0 and 1)
    """
    super().__init__(random_generator=random_generator, n_outliers=n_outliers)
    # density model used by generate() to tell low density regions apart
    estimator = KernelDensity(bandwidth="scott")
    self.density_estimator = estimator
    self.threshold_low_density = threshold_low_density

generate(X, y=None, **params)

Generate data points belonging to low density regions.

Pseudo code:

  - Standardize the data X
  - Estimate the density based upon the original data X
  - Compute a threshold for determining low density (so far 10th percentile)
  - Sample uniformly at random within the hypercube [min, max]
  - Estimate the density of the new points and filter out the ones with a density that is above the threshold

:param X: the input features

:param y: not used

:param params:

:return:

Source code in badgers/generators/tabular_data/outliers.py
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
def generate(self, X, y=None, **params):
    """
    Generate data points belonging to low density regions.

    Pseudo code:
    - Standardize the data X
    - Estimate the density based upon the original data X
    - Compute a threshold for determining low density: the `threshold_low_density` quantile of the
      densities estimated for the original data (10th percentile with the default value 0.1)
    - Sample uniformly at random within the hypercube [min, max]
    - Estimate the density of the new points and filter out the ones with a density that is above the threshold

    A warning is emitted when fewer than `n_outliers` candidates pass the filter.

    :param X: the input features
    :param y: not used
    :param params: optional `max_samples`: the number of candidate points to draw
        (defaults to 100 * n_outliers)
    :return: the generated outliers (in the original feature space, at most `n_outliers` rows)
        and an array of "outliers" labels
    """
    # standardize X
    scaler = StandardScaler()
    scaler.fit(X)
    Xt = scaler.transform(X)
    # fit density estimator
    self.density_estimator = self.density_estimator.fit(Xt)
    # threshold_low_density is a fraction in [0, 1], hence np.quantile (np.percentile expects 0-100)
    low_density_threshold = np.quantile(self.density_estimator.score_samples(Xt), self.threshold_low_density)

    max_samples = params.get('max_samples')
    if max_samples is None:
        max_samples = self.n_outliers * 100

    # draw all the candidates at once within the bounding box of the standardized data
    candidates = self.random_generator.uniform(
        low=np.min(Xt, axis=0),
        high=np.max(Xt, axis=0),
        size=(max_samples, Xt.shape[1])
    )
    if candidates.shape[0] > 0:
        # score the candidates in a single call and keep the ones lying in low density regions
        scores = self.density_estimator.score_samples(candidates)
        outliers = candidates[scores <= low_density_threshold]
    else:
        # nothing to score
        outliers = candidates

    if outliers.shape[0] < self.n_outliers:
        warnings.warn(
            f'LowDensitySamplingGenerator could not generate all {self.n_outliers} outliers. It only generated {len(outliers)}.')
    else:
        outliers = outliers[:self.n_outliers]

    # add "outliers" as labels for outliers
    yt = np.array(["outliers"] * len(outliers))

    # in the case no outliers could be generated
    if outliers.shape[0] == 0:
        return outliers, yt

    return scaler.inverse_transform(outliers), yt

OutliersGenerator

Bases: GeneratorMixin

Base class for transformers that add outliers to tabular data

Source code in badgers/generators/tabular_data/outliers.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class OutliersGenerator(GeneratorMixin):
    """
    Common base for generators that add outliers to tabular data.

    It only stores the shared configuration (random generator and number of
    outliers); the sampling strategy itself lives in `generate`, which
    subclasses are expected to override.
    """

    def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10):
        """
        Keep the configuration on the instance.

        NOTE: the default generator is created once, when this function is defined,
        so instances built without an explicit `random_generator` share that object
        (and therefore its state).

        :param random_generator: A random generator
        :param n_outliers: The number of outliers to generate
        """
        self.random_generator, self.n_outliers = random_generator, n_outliers

    @abc.abstractmethod
    def generate(self, X, y=None, **params):
        """
        Declared abstract: this base implementation does nothing and returns None.

        :param X: the input features
        :param y: optional, defaults to None
        :param params: additional keyword arguments
        :return: None
        """
        return None

__init__(random_generator=default_rng(seed=0), n_outliers=10)

:param random_generator: A random generator :param n_outliers: The number of outliers to generate

Source code in badgers/generators/tabular_data/outliers.py
22
23
24
25
26
27
28
def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10):
    """
    Keep the generator configuration on the instance.

    NOTE: the default generator is created once, when this function is defined,
    so instances built without an explicit `random_generator` share that object.

    :param random_generator: A random generator
    :param n_outliers: The number of outliers to generate
    """
    self.random_generator, self.n_outliers = random_generator, n_outliers

ZScoreSamplingGenerator

Bases: OutliersGenerator

Randomly generates outliers as data points whose z-score has an absolute value above 3 in every dimension.

Source code in badgers/generators/tabular_data/outliers.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
class ZScoreSamplingGenerator(OutliersGenerator):
    """
    Randomly generates outliers as data points whose z-score has an absolute value > 3
    in every dimension.
    """

    def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10):
        """

        :param random_generator: A random generator
        :param n_outliers: The number of outliers to generate
        """
        super().__init__(random_generator, n_outliers)

    def generate(self, X, y=None, **params):
        """
        Randomly generates outliers as data points whose z-score has an absolute value > 3.

        1. Standardize the input data (mean = 0, variance = 1)
        2. Generate outliers as follows:
            - the sign is randomly chosen
            - for each dimension: the value is equal to 3 + a random number following an exponential distribution function
            with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
        3. Inverse the standardization transformation

        :param X: the input features
        :param y: not used
        :param params: not used
        :return: a tuple (outliers, labels): the generated outliers expressed in the original
            feature scale, and an array holding the label "outliers" once per generated row.
            When no outliers are requested, an empty array of shape (0, n_features) and an
            empty label array are returned.
        """

        # standardize X
        scaler = StandardScaler().fit(X)
        Xt = scaler.transform(X)
        n_features = Xt.shape[1]

        # generate outliers, one row at a time: for every row the signs are drawn first,
        # then the exponential offsets that are added to the 3-sigma base
        rng = self.random_generator
        rows = [
            random_sign(rng, size=n_features) * (3. + rng.exponential(size=n_features))
            for _ in range(self.n_outliers)
        ]

        # add "outliers" as labels for outliers
        yt = np.full(len(rows), "outliers")

        # in the case no outliers were requested: the scaler cannot inverse-transform an
        # empty array, so return the empty result directly
        if not rows:
            return np.empty((0, n_features)), yt

        # a list of equally sized rows always stacks into a 2D array, as sklearn expects
        return scaler.inverse_transform(np.array(rows)), yt

__init__(random_generator=default_rng(seed=0), n_outliers=10)

:param random_generator: A random generator :param n_outliers: The number of outliers to generate

Source code in badgers/generators/tabular_data/outliers.py
40
41
42
43
44
45
46
def __init__(self, random_generator=default_rng(seed=0), n_outliers: int = 10):
    """
    Forward the configuration unchanged to the parent constructor.

    NOTE: the default generator is created once, when this function is defined,
    so instances built without an explicit `random_generator` share that object.

    :param random_generator: A random generator
    :param n_outliers: The number of outliers to generate
    """
    super().__init__(random_generator=random_generator, n_outliers=n_outliers)

generate(X, y=None, **params)

Randomly generates outliers as data points whose z-score has an absolute value above 3 in every dimension.

  1. Standardize the input data (mean = 0, variance = 1)
  2. Generate outliers as follows:
    • the sign is randomly chosen
    • for each dimension: the value is equal to 3 + a random number following an exponential distribution function with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
  3. Inverse the standardization transformation

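:param X: the input features :param y: not used :param params: additional keyword arguments (not used) :return: a tuple (outliers, labels) — the generated outliers mapped back to the original feature scale, and an array containing the label "outliers" for each of them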

Source code in badgers/generators/tabular_data/outliers.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def generate(self, X, y=None, **params):
    """
    Randomly generates outliers as data points whose z-score has an absolute value > 3.

    1. Standardize the input data (mean = 0, variance = 1)
    2. Generate outliers as follows:
        - the sign is randomly chosen
        - for each dimension: the value is equal to 3 + a random number following an exponential distribution function
        with default parameters (see https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.exponential.html)
    3. Inverse the standardization transformation

    :param X: the input features
    :param y: not used
    :param params: not used
    :return: a tuple (outliers, labels): the generated outliers expressed in the original
        feature scale, and an array holding the label "outliers" once per generated row.
        When no outliers are requested, an empty array of shape (0, n_features) and an
        empty label array are returned.
    """

    # standardize X
    scaler = StandardScaler().fit(X)
    Xt = scaler.transform(X)
    n_features = Xt.shape[1]

    # generate outliers, one row at a time: for every row the signs are drawn first,
    # then the exponential offsets that are added to the 3-sigma base
    rng = self.random_generator
    rows = [
        random_sign(rng, size=n_features) * (3. + rng.exponential(size=n_features))
        for _ in range(self.n_outliers)
    ]

    # add "outliers" as labels for outliers
    yt = np.full(len(rows), "outliers")

    # in the case no outliers were requested: the scaler cannot inverse-transform an
    # empty array, so return the empty result directly
    if not rows:
        return np.empty((0, n_features)), yt

    # a list of equally sized rows always stacks into a 2D array, as sklearn expects
    return scaler.inverse_transform(np.array(rows)), yt