Generating outliers in tabular data¶
This tutorial shows how to generate outliers (extreme values) by generating data points with a z-score greater than 3.
from badgers.generators.tabular_data.outliers import *
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def plot_outliers(X, outliers, y):
    """
    Plot the original data side by side with the data plus generated outliers.

    :param X: 2D array of shape (n_samples, 2) holding the original data points
    :param outliers: 2D array of shape (n_outliers, 2) holding the generated outliers
    :param y: 1D array of class labels, one per row of X
    :return: the matplotlib ``(fig, axes)`` pair for further customization
    """
    fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8, 4))
    # plot the original data, one color per class label
    # (iterate over the actual label values so non-contiguous labels also work,
    # instead of assuming labels are 0..k-1 as range(len(unique)) did)
    for label in np.unique(y):
        ix = np.where(y == label)
        axes[0].scatter(X[ix, 0], X[ix, 1], c=f'C{label}', label=f'{label}')
        axes[1].scatter(X[ix, 0], X[ix, 1], c=f'C{label}', label=f'{label}')
    # overlay the generated outliers on the right-hand ("Transformed") plot
    axes[1].scatter(outliers[:, 0], outliers[:, 1], marker='x', c='black', label='outliers')
    # titles and axis labels
    axes[0].set_title('Original')
    axes[1].set_title('Transformed')
    for ax in axes:
        ax.set_xlabel('1st dimension')
        ax.set_ylabel('2nd dimension')
    axes[1].legend(ncol=1, bbox_to_anchor=(1, 1))
    plt.tight_layout()
    return fig, axes
Setup random generator¶
from numpy.random import default_rng

# Fix the seed so every run of the tutorial produces the same outliers
seed = 0
rng = default_rng(seed)
Load and prepare data¶
We first load an existing dataset from sklearn.datasets
# Build a toy dataset: four well-separated Gaussian blobs in two dimensions
X, y = make_blobs(centers=4, random_state=0, cluster_std=0.5)
X = pd.DataFrame(data=X, columns=['dimension_0', 'dimension_1'])
y = pd.Series(y)

# Generate 25 outliers with a z-score above 3 and plot them next to the data
generator = ZScoreSamplingGenerator(random_generator=rng)
outliers, _ = generator.generate(X.copy(), y=y, n_outliers=25)
fig, axes = plot_outliers(X.values, outliers, y.values)
Generate outliers using hypersphere sampling¶
The transformer generates data points on a hypersphere of radius greater than 3 sigmas
# Sample 25 outliers on a hypersphere far from the data and visualize them
generator = HypersphereSamplingGenerator(random_generator=rng)
outliers, _ = generator.generate(X.copy(), y, n_outliers=25)
fig, axes = plot_outliers(X.values, outliers, y.values)
Generate outliers using independent histogram sampling¶
# Sample 25 outliers from per-feature (independent) histograms and plot them
generator = IndependentHistogramsGenerator(random_generator=rng)
outliers, _ = generator.generate(X.copy(), y, n_outliers=25)
fig, axes = plot_outliers(X.values, outliers, y.values)
Generate outliers using histogram sampling¶
Note this only works for datasets with low dimensionality (5 dimensions or less). If you wish to apply it with a dataset with more than 5 dimensions, first apply a dimensionality reduction technique.
# Sample 25 outliers from a joint histogram of the data and plot them
generator = HistogramSamplingGenerator(random_generator=rng)
outliers, _ = generator.generate(X.copy(), y, n_outliers=25)
fig, axes = plot_outliers(X.values, outliers, y.values)
Generate outliers using low density sampling¶
# Sample 25 outliers from low-density regions (density below 0.25,
# drawing at most 100 candidate samples) and plot them
generator = LowDensitySamplingGenerator(random_generator=rng)
outliers, _ = generator.generate(
    X.copy(), y, max_samples=100, n_outliers=25, threshold_low_density=0.25
)
fig, axes = plot_outliers(X.values, outliers, y.values)
Generate outliers by first reducing the dimensions and then apply an outlier transformer¶
Dimensionality reduction and z-score sampling¶
Here are a couple of examples of how to generate outliers by first applying dimensionality reduction methods from the sklearn.decomposition module (like PCA, KernelPCA, etc.) and then applying the ZScore transformer
# Regenerate the blobs in 10 dimensions to demonstrate decomposition-based generators
X, y = make_blobs(n_features=10, centers=4, cluster_std=0.60, random_state=0)
from sklearn.decomposition import PCA, KernelPCA, FastICA

# Reduce to 3 components with PCA, sample z-score outliers in the reduced
# space, then map them back to the original 10 dimensions
generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=PCA(n_components=3),
    outlier_generator=ZScoreSamplingGenerator()
)
outliers, _ = generator.generate(X.copy(), y, n_outliers=25)
fig, axes = plot_outliers(X, outliers, y)
Instead of PCA one can use any class from the sklearn.decomposition module that provides an inverse_transform
method.
# Same idea with KernelPCA; fit_inverse_transform=True is required so the
# outliers can be mapped back to the original feature space
generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=KernelPCA(n_components=3, fit_inverse_transform=True),
    outlier_generator=ZScoreSamplingGenerator()
)
outliers, _ = generator.generate(X.copy(), y, n_outliers=25)
fig, axes = plot_outliers(X, outliers, y)
Here is yet another example, specifying keyword arguments for the decomposition method
# FastICA with explicit keyword arguments for the decomposition step
generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=FastICA(n_components=3, whiten='unit-variance'),
    outlier_generator=ZScoreSamplingGenerator()
)
outliers, _ = generator.generate(X.copy(), y, n_outliers=25)
fig, axes = plot_outliers(X, outliers, y)
Using hypersphere sampling with dimension reduction techniques¶
Again, one can first apply a dimensionality reduction technique and then apply the transformer
# Combine FastICA dimensionality reduction with hypersphere outlier sampling
generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=FastICA(n_components=3, whiten='unit-variance'),
    outlier_generator=HypersphereSamplingGenerator()
)
outliers, _ = generator.generate(X.copy(), y, n_outliers=25)
fig, axes = plot_outliers(X, outliers, y)
Using histogram sampling with dimension reduction techniques¶
Here again, one can first apply a dimensionality reduction technique and then apply the transformer
# Combine FastICA dimensionality reduction with histogram outlier sampling
generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=FastICA(n_components=3, whiten='unit-variance'),
    outlier_generator=HistogramSamplingGenerator()
)
outliers, _ = generator.generate(X.copy(), y, n_outliers=25)
fig, axes = plot_outliers(X, outliers, y)