Generating imbalancedness in tabular data¶
This tutorial shows how to generate imbalanced data on pre-existing tabular data and how to visualize both the original and the transformed data.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris, make_blobs
from badgers.generators.tabular_data.imbalance import RandomSamplingClassesGenerator, RandomSamplingTargetsGenerator, RandomSamplingFeaturesGenerator
from badgers.core.utils import normalize_proba
Setup random generator¶
# Seed the NumPy random generator so every sampling step below is reproducible.
from numpy.random import default_rng

SEED = 0
rng = default_rng(SEED)
Sample data from different classes¶
Here we are going to sample the dataset in a way that we will have 50% of all data coming from the first class, 5% coming from the second class, 25% coming from the third class, and 20% coming from the last class.
# Load a toy classification dataset: 4 Gaussian blobs in 2 dimensions.
X, y = make_blobs(centers=4, random_state=0)
X = pd.DataFrame(data=X, columns=['dimension_0', 'dimension_1'])
y = pd.Series(y)

# Desired class proportions after resampling (class label -> fraction of rows).
proportion_classes = {0: 0.5, 1: 0.05, 2: 0.25, 3: 0.2}

# Resample so the class distribution matches `proportion_classes`.
trf = RandomSamplingClassesGenerator(random_generator=rng)
Xt, yt = trf.generate(X=X.copy(), y=y, proportion_classes=proportion_classes)

# Plot the original (left) and the resampled (right) data side by side,
# reusing one palette so class colors match across both panels.
palette = sns.color_palette("tab10")[:4]
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8, 4))
sns.scatterplot(data=X, ax=axes[0], x='dimension_0', y='dimension_1', hue=y, palette=palette, legend=True)
sns.scatterplot(data=Xt, ax=axes[1], x='dimension_0', y='dimension_1', hue=yt, palette=palette, legend=True)
axes[0].set_title('Original')
axes[1].set_title('Transformed')
plt.tight_layout()
Sample data based upon the regression target¶
Here, we create some imbalanced data set by sampling data points proportionally to the regression target.
For this tutorial we create a fake regression target y.
# Build a synthetic regression target as a linear combination of the two features.
d0 = X['dimension_0']
d1 = X['dimension_1']
y = -2 * d0 + 3 * d1 + 5
def sampling_proba_func(y):
    """Turn the regression target into per-row sampling probabilities.

    The target is first min-max scaled to [0, 1], then normalized (via
    badgers' ``normalize_proba``) so the values sum to 1.
    """
    lo, hi = np.min(y), np.max(y)
    scaled = (y - lo) / (hi - lo)
    return normalize_proba(scaled)
# Resample the data, keeping rows with probability driven by the target value.
trf = RandomSamplingTargetsGenerator(random_generator=rng)
Xt, _ = trf.generate(X=X.copy(), y=y, sampling_proba_func=sampling_proba_func)

# Original data (left, colored by target value) vs resampled data (right).
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(8, 4))
sns.scatterplot(data=X, ax=axes[0], x='dimension_0', y='dimension_1', hue=y, legend=False)
sns.scatterplot(data=Xt, ax=axes[1], x='dimension_0', y='dimension_1', legend=False)
for ax, title in zip(axes, ('Original', 'Transformed')):
    ax.set_title(title)
plt.tight_layout()
Sample data based upon the features themselves¶
Instead of using the classification labels or the regression target, we create an imbalanced data set in which we remove data points based upon some features.
In the example below we use the value of the first feature X['dimension_0']
to determine whether a data point will be removed, therefore creating a kind of imbalancedness.
def sampling_proba_func(X):
    """Return per-row sampling probabilities driven by 'dimension_0'.

    The feature column is min-max scaled to [0, 1] and then normalized
    (via badgers' ``normalize_proba``) into a probability distribution.
    """
    col = X['dimension_0']
    lo, hi = np.min(col), np.max(col)
    return normalize_proba((col - lo) / (hi - lo))
# Resample the data based on the feature-driven probabilities defined above.
trf = RandomSamplingFeaturesGenerator(random_generator=rng)
Xt, _ = trf.generate(X=X.copy(), y=y, sampling_proba_func=sampling_proba_func)

# Top row: scatter plots (original colored by its sampling probability).
# Bottom row: histograms of 'dimension_0' before and after resampling.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=False, figsize=(8, 8))
sns.scatterplot(data=X, x="dimension_0", y="dimension_1", hue=sampling_proba_func(X), ax=axes[0, 0], legend=False)
sns.scatterplot(data=Xt, x="dimension_0", y="dimension_1", ax=axes[0, 1])
sns.histplot(data=X, x="dimension_0", kde=True, ax=axes[1, 0])
sns.histplot(data=Xt, x="dimension_0", kde=True, ax=axes[1, 1])
for ax, title in zip(axes[0], ('Original', 'Transformed')):
    ax.set_title(title)
plt.tight_layout()