Generating outliers in tabular data¶
This tutorial shows how to generate outliers (extreme values) by generating data points with a z-score greater than 3.
from badgers.generators.tabular_data.outliers import *
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
def plot_outliers(X, outliers, y):
    """
    Plot the original data side by side with the data plus generated outliers.

    :param X: 2D array of shape (n_samples, n_features); only the first two
        columns are plotted
    :param outliers: 2D array of generated outliers; only the first two
        columns are plotted
    :param y: 1D array of integer class labels used to color the original points
    :return: the matplotlib ``(fig, axes)`` pair for further customization
    """
    fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8, 4))
    # plot the original data, one color per class, on both panels
    for i in range(len(np.unique(y))):
        ix = np.where(y == i)
        axes[0].scatter(X[ix, 0], X[ix, 1], c=f'C{i}', label=f'{i}')
        axes[1].scatter(X[ix, 0], X[ix, 1], c=f'C{i}', label=f'{i}')
    # overlay the generated outliers on the right-hand panel only
    # (fixed: 'black' was wrapped in a needless f-string)
    axes[1].scatter(outliers[:, 0], outliers[:, 1], marker='x', c='black', label='outliers')
    # titles and axis labels
    axes[0].set_title('Original')
    axes[1].set_title('Transformed')
    axes[0].set_xlabel('1st dimension')
    axes[0].set_ylabel('2nd dimension')
    axes[1].set_xlabel('1st dimension')
    axes[1].set_ylabel('2nd dimension')
    axes[1].legend(ncol=1, bbox_to_anchor=(1, 1))
    plt.tight_layout()
    return fig, axes
Load and prepare data¶
We first load an existing dataset from sklearn.datasets
# Build a toy dataset of 4 Gaussian blobs, then sample 10 outliers
# whose z-score exceeds 3 and visualize them next to the original data.
X, y = make_blobs(centers=4, cluster_std=0.50, random_state=0)
zscore_generator = ZScoreSamplingGenerator(n_outliers=10)
outliers, _ = zscore_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)
Generate outliers using hypersphere sampling¶
The transformer generates data points on a hypersphere of radius greater than 3 sigmas
# Sample 25 outliers on a hypersphere of radius > 3 sigmas and plot them.
hypersphere_generator = HypersphereSamplingGenerator(n_outliers=25)
outliers, _ = hypersphere_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)
Generate outliers using independent histogram sampling¶
# Sample 20 outliers using per-feature (independent) histograms and plot them.
indep_hist_generator = IndependentHistogramsGenerator(n_outliers=20)
outliers, _ = indep_hist_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)
Generate outliers using histogram sampling¶
Note this only works for datasets with low dimensionality (5 dimensions or less). If you wish to apply it with a dataset with more than 5 dimensions, first apply a dimensionality reduction technique.
# Sample 20 outliers from a joint multidimensional histogram and plot them.
hist_generator = HistogramSamplingGenerator(n_outliers=20)
outliers, _ = hist_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)
Generate outliers using low density sampling¶
# Sample 10 outliers from low-density regions (density below 0.25),
# drawing at most 100 candidate samples, and plot them.
low_density_generator = LowDensitySamplingGenerator(n_outliers=10, threshold_low_density=0.25)
outliers, _ = low_density_generator.generate(X.copy(), y, max_samples=100)
fig, axes = plot_outliers(X, outliers, y)
Generate outliers by first reducing the dimensions and then applying an outlier transformer¶
Dimensionality reduction and z-score sampling¶
Here are a couple of examples on how to generate outliers by first applying dimensionality reduction methods from the sklearn.decomposition module (like PCA, KernelPCA, etc.) and then applying the ZScore transformer
# generate some data with higher dimensionality (10 features, 4 blob centers)
X, y = make_blobs(n_features=10, centers=4, cluster_std=0.60, random_state=0)
from sklearn.decomposition import PCA, KernelPCA, FastICA
# Reduce the 10-dimensional data to 3 principal components, then
# sample 20 z-score outliers in the reduced space and plot the result.
pca_zscore_generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=PCA(n_components=3),
    outlier_generator=ZScoreSamplingGenerator(n_outliers=20),
)
outliers, _ = pca_zscore_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)
Instead of PCA one can use any class from the sklearn.decomposition module that provides an inverse_transform
method.
# KernelPCA needs fit_inverse_transform=True so outliers generated in the
# reduced space can be mapped back to the original feature space.
kpca_zscore_generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=KernelPCA(n_components=3, fit_inverse_transform=True),
    outlier_generator=ZScoreSamplingGenerator(n_outliers=20),
)
outliers, _ = kpca_zscore_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)
Here is yet another example specifying keyword arguments for the decomposition method
# Same pattern with FastICA, passing keyword arguments to the
# decomposition method (here: whiten='unit-variance').
ica_zscore_generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=FastICA(n_components=3, whiten='unit-variance'),
    outlier_generator=ZScoreSamplingGenerator(n_outliers=20),
)
outliers, _ = ica_zscore_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)
Using hypersphere sampling with dimension reduction techniques¶
Again, one can first apply a dimensionality reduction technique and then apply the transformer
# FastICA dimensionality reduction followed by hypersphere outlier sampling.
ica_hypersphere_generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=FastICA(n_components=3, whiten='unit-variance'),
    outlier_generator=HypersphereSamplingGenerator(n_outliers=20),
)
outliers, _ = ica_hypersphere_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)
Using histogram sampling with dimension reduction techniques¶
Here again, one can first apply a dimensionality reduction technique and then apply the transformer
# FastICA dimensionality reduction followed by histogram outlier sampling.
ica_hist_generator = DecompositionAndOutlierGenerator(
    decomposition_transformer=FastICA(n_components=3, whiten='unit-variance'),
    outlier_generator=HistogramSamplingGenerator(n_outliers=20),
)
outliers, _ = ica_hist_generator.generate(X.copy(), y)
fig, axes = plot_outliers(X, outliers, y)