Generating imbalancedness in tabular data¶
This tutorial shows how to generate imbalanced data on pre-existing tabular data and how to visualize both the original and the transformed data.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris, make_blobs
from badgers.generators.tabular_data.imbalance import RandomSamplingClassesGenerator, RandomSamplingTargetsGenerator, RandomSamplingFeaturesGenerator
from badgers.core.utils import normalize_proba
Setup random generator¶
# Seed the NumPy random generator so every sampling step below is reproducible.
from numpy.random import default_rng

SEED = 0
rng = default_rng(SEED)
Sample data from different classes¶
Here we are going to sample the dataset in a way that we will have 50% of all data coming from the first class, 5% coming from the second class, 25% coming from the third class, and 20% coming from the last class.
# Load a toy classification dataset: 4 Gaussian blobs in 2 dimensions.
X, y = make_blobs(centers=4, random_state=0)
X = pd.DataFrame(data=X, columns=['dimension_0', 'dimension_1'])
y = pd.Series(y)

# Desired class proportions after resampling (class label -> fraction of rows).
proportion_classes = {0: 0.5, 1: 0.05, 2: 0.25, 3: 0.2}

# Resample so the class distribution matches `proportion_classes`.
trf = RandomSamplingClassesGenerator(random_generator=rng)
Xt, yt = trf.generate(X=X.copy(), y=y, proportion_classes=proportion_classes)

# Plot the original (left) and the resampled (right) data side by side,
# reusing one palette so class colors match across both panels.
palette = sns.color_palette("tab10")[:4]
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8, 4))
sns.scatterplot(data=X, ax=axes[0], x='dimension_0', y='dimension_1', hue=y, palette=palette, legend=True)
sns.scatterplot(data=Xt, ax=axes[1], x='dimension_0', y='dimension_1', hue=yt, palette=palette, legend=True)
axes[0].set_title('Original')
axes[1].set_title('Transformed')
plt.tight_layout()
Sample data based upon the regression target¶
Here, we create some imbalanced data set by sampling data points proportionally to the regression target.
For this tutorial we create a fake regression target y.
# Build a synthetic regression target as a linear combination of the two features.
d0 = X['dimension_0']
d1 = X['dimension_1']
y = -2 * d0 + 3 * d1 + 5
def sampling_proba_func(y):
    """Turn the regression target into per-row sampling probabilities.

    The target is first min-max scaled to [0, 1], then normalized (via
    badgers' ``normalize_proba``) so the values sum to 1.
    """
    lo, hi = np.min(y), np.max(y)
    scaled = (y - lo) / (hi - lo)
    return normalize_proba(scaled)
# Resample the data, keeping rows with probability driven by the target value.
trf = RandomSamplingTargetsGenerator(random_generator=rng)
Xt, _ = trf.generate(X=X.copy(), y=y, sampling_proba_func=sampling_proba_func)

# Original data (left, colored by target value) vs resampled data (right).
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8, 4))
sns.scatterplot(data=X, ax=axes[0], x='dimension_0', y='dimension_1', hue=y, legend=False)
sns.scatterplot(data=Xt, ax=axes[1], x='dimension_0', y='dimension_1', legend=False)
for ax, title in zip(axes, ('Original', 'Transformed')):
    ax.set_title(title)
plt.tight_layout()
Sample data based upon the features themselves¶
Instead of using the classification labels or the regression target, we create an imbalanced data set in which we remove data points based upon some features.
In the example below we use the value of the first feature X['dimension_0']
to determine whether a data point will be removed, therefore creating a kind of imbalancedness.
def sampling_proba_func(X):
    """Return per-row sampling probabilities driven by 'dimension_0'.

    The feature column is min-max scaled to [0, 1] and then normalized
    (via badgers' ``normalize_proba``) into a probability distribution.
    """
    col = X['dimension_0']
    lo, hi = np.min(col), np.max(col)
    return normalize_proba((col - lo) / (hi - lo))
# Resample the data based on the feature-driven probabilities defined above.
trf = RandomSamplingFeaturesGenerator(random_generator=rng)
Xt, _ = trf.generate(X=X.copy(), y=y, sampling_proba_func=sampling_proba_func)

# Top row: scatter plots (original colored by its sampling probability).
# Bottom row: histograms of 'dimension_0' before and after resampling.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=False, figsize=(8, 8))
sns.scatterplot(data=X, x="dimension_0", y="dimension_1", hue=sampling_proba_func(X), ax=axes[0, 0], legend=False)
sns.scatterplot(data=Xt, x="dimension_0", y="dimension_1", ax=axes[0, 1])
sns.histplot(data=X, x="dimension_0", kde=True, ax=axes[1, 0])
sns.histplot(data=Xt, x="dimension_0", kde=True, ax=axes[1, 1])
for ax, title in zip(axes[0], ('Original', 'Transformed')):
    ax.set_title(title)
plt.tight_layout()