Generating noise in tabular data¶
This tutorial shows how to generate noise on pre-existing tabular data and to visualize both the original and the transformed data
In [1]:
Copied!
from sklearn.datasets import make_blobs
from badgers.generators.tabular_data.noise import GaussianNoiseGenerator, GaussianNoiseClassesGenerator
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from badgers.generators.tabular_data.noise import GaussianNoiseGenerator, GaussianNoiseClassesGenerator
import matplotlib.pyplot as plt
import numpy as np
In [2]:
Copied!
def plot_noise(X, y, Xt, yt):
"""
Some utility function to generate the plots
"""
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8,4))
for label in np.unique(y):
ix = np.where(y == label)
axes[0].scatter(X[ix,0],X[ix,1],c = f'C{label}', label = f'{label}')
for label in np.unique(yt):
ix = np.where(yt == label)
axes[1].scatter(Xt[ix,0],Xt[ix,1],c = f'C{label}', label = f'{label}')
axes[0].set_title('Original')
axes[1].set_title('Transformed')
axes[0].set_xlabel('dimension 0', fontsize=10)
axes[1].set_xlabel('dimension 0', fontsize=10)
axes[0].set_ylabel('dimension 1', fontsize=10)
axes[1].set_ylabel('dimension 1', fontsize=10)
axes[0].legend()
axes[1].legend()
return fig, axes
def plot_noise(X, y, Xt, yt):
"""
Some utility function to generate the plots
"""
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8,4))
for label in np.unique(y):
ix = np.where(y == label)
axes[0].scatter(X[ix,0],X[ix,1],c = f'C{label}', label = f'{label}')
for label in np.unique(yt):
ix = np.where(yt == label)
axes[1].scatter(Xt[ix,0],Xt[ix,1],c = f'C{label}', label = f'{label}')
axes[0].set_title('Original')
axes[1].set_title('Transformed')
axes[0].set_xlabel('dimension 0', fontsize=10)
axes[1].set_xlabel('dimension 0', fontsize=10)
axes[0].set_ylabel('dimension 1', fontsize=10)
axes[1].set_ylabel('dimension 1', fontsize=10)
axes[0].legend()
axes[1].legend()
return fig, axes
Load and prepare data¶
We first load an existing dataset from sklearn.datasets
In [3]:
Copied!
X, y = make_blobs(centers=4, random_state=0, cluster_std=0.25)
X, y = make_blobs(centers=4, random_state=0, cluster_std=0.25)
Generate noise¶
The transformer applies an additive Gaussian noise to each dimension
In [4]:
Copied!
trf = GaussianNoiseGenerator(noise_std=0.25)
Xt, yt = trf.generate(X.copy(), y)
trf = GaussianNoiseGenerator(noise_std=0.25)
Xt, yt = trf.generate(X.copy(), y)
In [5]:
Copied!
fig, axes = plot_noise(X, y, Xt, yt)
fig, axes = plot_noise(X, y, Xt, yt)
Generate noise for each class separately¶
The transformer applies an additive Gaussian noise to each dimension for each class separately.
In [6]:
Copied!
trf = GaussianNoiseClassesGenerator(noise_std_per_class={0:0.1, 1:0.2, 2:0.3, 3:0.4})
Xt, yt = trf.generate(X.copy(), y)
trf = GaussianNoiseClassesGenerator(noise_std_per_class={0:0.1, 1:0.2, 2:0.3, 3:0.4})
Xt, yt = trf.generate(X.copy(), y)
In [7]:
Copied!
fig, axes = plot_noise(X, y, Xt, yt)
fig, axes = plot_noise(X, y, Xt, yt)