Skip to content

imbalance

ImbalanceGenerator

Bases: GeneratorMixin

Base class for transformers that make tabular data imbalanced

Source code in badgers/generators/tabular_data/imbalance.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
class ImbalanceGenerator(GeneratorMixin):
    """
    Common parent of the generators that make tabular data imbalanced.

    The class only keeps the random generator; the resampling itself is left to
    the subclasses, which override :meth:`generate`.
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        Store the random generator on the instance.

        :param random_generator: A random generator, kept as ``self.random_generator``.
            The default value is created once, when the function is defined, so
            instances built without an explicit generator share the same object.
        """
        self.random_generator = random_generator

    @abc.abstractmethod
    def generate(self, X, y=None, **params):
        """
        Declared abstract: this base implementation does nothing and returns None.

        :param X: the input features
        :param y: the targets (optional)
        :param params: additional keyword arguments
        """

__init__(random_generator=default_rng(seed=0))

:param random_generator: A random generator

Source code in badgers/generators/tabular_data/imbalance.py
16
17
18
19
20
def __init__(self, random_generator=default_rng(seed=0)):
    """
    Store the random generator on the instance.

    :param random_generator: A random generator, kept as ``self.random_generator``.
        The default value is created once, when the function is defined, so
        instances built without an explicit generator share the same object.
    """
    self.random_generator = random_generator

RandomSamplingClassesGenerator

Bases: ImbalanceGenerator

Randomly samples data points within predefined classes

Source code in badgers/generators/tabular_data/imbalance.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
class RandomSamplingClassesGenerator(ImbalanceGenerator):
    """
    Randomly samples data points within predefined classes
    """

    def __init__(self, random_generator=default_rng(seed=0), proportion_classes: dict = None):
        """
        Configure the share of instances to draw for each class.

        :param random_generator: A random generator, forwarded to the parent constructor
        :param proportion_classes: Mapping from class label to the fraction of ``X.shape[0]`` to draw
            for that class. The default None is not usable by ``generate``, which iterates over the
            items of this mapping.
            Example for having in total 50% of class 'A', 30% of class 'B', and 20% of class 'C'
            proportion_classes={'A':0.5, 'B':0.3, 'C':0.2}
        """
        super().__init__(random_generator=random_generator)
        self.transformed_labels_ = None
        self.proportion_classes = proportion_classes

    @numpy_API
    def generate(self, X, y, **params):
        """
        Randomly samples instances for each class

        For every ``label: proportion`` entry of ``self.proportion_classes``,
        ``int(proportion * X.shape[0])`` rows are drawn with replacement among the rows
        of X selected by ``y == label``. The draws are stacked in the iteration order
        of the mapping.

        NOTE(review): assumes X and y are numpy arrays at this point (presumably what
        the ``numpy_API`` decorator provides) -- confirm.

        :param X: the input features
        :param y: the class labels, compared with each configured label
        :param params: additional keyword arguments (not used here)
        :return: Xt, yt -- the stacked sampled rows and the array of their labels
        :raises ValueError: if ``proportion_classes`` is None or empty, or if instances are
            requested for a label that selects no row of X
        """
        # the constructor default is None: fail with an explicit message instead of
        # an AttributeError on ``None.items()`` (or an opaque error from ``np.vstack([])``)
        if not self.proportion_classes:
            raise ValueError(
                "proportion_classes must be a non-empty dict mapping each class label to its proportion"
            )

        n_rows = X.shape[0]
        Xt = []
        transformed_labels = []

        for label, prop in self.proportion_classes.items():
            # truncation: the total number of sampled rows may be lower than n_rows
            size = int(prop * n_rows)
            candidates = X[y == label]
            if size > 0 and len(candidates) == 0:
                # ``choice`` would raise a ValueError here too, but without naming the label
                raise ValueError(
                    f"cannot sample {size} instance(s) of class {label!r}: no instance with this label in y"
                )
            Xt.append(self.random_generator.choice(candidates, size=size, replace=True))
            transformed_labels += [label] * size

        Xt = np.vstack(Xt)
        yt = np.array(transformed_labels)

        return Xt, yt

__init__(random_generator=default_rng(seed=0), proportion_classes=None)

:param random_generator: A random generator :param proportion_classes: Example for having in total 50% of class 'A', 30% of class 'B', and 20% of class 'C' proportion_classes={'A':0.5, 'B':0.3, 'C':0.2}

Source code in badgers/generators/tabular_data/imbalance.py
61
62
63
64
65
66
67
68
69
70
def __init__(self, random_generator=default_rng(seed=0), proportion_classes: dict = None):
    """
    Configure the share of instances to draw for each class.

    :param random_generator: A random generator, forwarded to the parent constructor
    :param proportion_classes: Mapping from class label to a proportion. The default None is not
        usable by ``generate``, which iterates over the items of this mapping.
        Example for having in total 50% of class 'A', 30% of class 'B', and 20% of class 'C'
        proportion_classes={'A':0.5, 'B':0.3, 'C':0.2}
    """
    super().__init__(random_generator=random_generator)
    # initialised to None here; no other assignment is visible in this listing
    self.transformed_labels_ = None
    self.proportion_classes = proportion_classes

generate(X, y, **params)

Randomly samples instances for each class

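:param X: the input features
:param y: the class labels
:param params: additional keyword arguments
:return: Xt, yt (the sampled instances and their labels)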

Source code in badgers/generators/tabular_data/imbalance.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
@numpy_API
def generate(self, X, y, **params):
    """
    Randomly samples instances for each class

    For every ``label: proportion`` entry of ``self.proportion_classes``,
    ``int(proportion * X.shape[0])`` rows are drawn with replacement among the rows
    of X selected by ``y == label``. The draws are stacked in the iteration order
    of the mapping.

    NOTE(review): assumes X and y are numpy arrays at this point (presumably what
    the ``numpy_API`` decorator provides) -- confirm.

    :param X: the input features
    :param y: the class labels, compared with each configured label
    :param params: additional keyword arguments (not used here)
    :return: Xt, yt -- the stacked sampled rows and the array of their labels
    :raises ValueError: if ``proportion_classes`` is None or empty, or if instances are
        requested for a label that selects no row of X
    """
    # the constructor default is None: fail with an explicit message instead of
    # an AttributeError on ``None.items()`` (or an opaque error from ``np.vstack([])``)
    if not self.proportion_classes:
        raise ValueError(
            "proportion_classes must be a non-empty dict mapping each class label to its proportion"
        )

    n_rows = X.shape[0]
    Xt = []
    transformed_labels = []

    for label, prop in self.proportion_classes.items():
        # truncation: the total number of sampled rows may be lower than n_rows
        size = int(prop * n_rows)
        candidates = X[y == label]
        if size > 0 and len(candidates) == 0:
            # ``choice`` would raise a ValueError here too, but without naming the label
            raise ValueError(
                f"cannot sample {size} instance(s) of class {label!r}: no instance with this label in y"
            )
        Xt.append(self.random_generator.choice(candidates, size=size, replace=True))
        transformed_labels += [label] * size

    Xt = np.vstack(Xt)
    yt = np.array(transformed_labels)

    return Xt, yt

RandomSamplingFeaturesGenerator

Bases: ImbalanceGenerator

Source code in badgers/generators/tabular_data/imbalance.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
class RandomSamplingFeaturesGenerator(ImbalanceGenerator):
    """
    Randomly resamples instances with probabilities computed from the features
    """

    def __init__(self, random_generator=default_rng(seed=0), sampling_proba_func=lambda X: normalize_proba(X[:, 0])):
        """
        Configure how the sampling probabilities are derived from the features.

        :param random_generator: A random generator, forwarded to the parent constructor
        :param sampling_proba_func: A function that takes as input data and returns a sampling probability.
            By default, ``normalize_proba`` is applied to the first column of X.
        """
        super().__init__(random_generator=random_generator)
        self.sampling_proba_func = sampling_proba_func

    @numpy_API
    def generate(self, X, y=None, **params):
        """
        Randomly samples instances based on the features values in X

        ``X.shape[0]`` row indices are drawn with replacement; the output of
        ``self.sampling_proba_func(X)`` is handed to ``choice`` as the probabilities ``p``.

        :param X: the input features
        :param y: the targets (optional); when given, the same row indices are applied to it
        :param params: additional keyword arguments (not used here)
        :return: Xt, yt (yt is None when y is None)
        """
        row_probabilities = self.sampling_proba_func(X)
        n_rows = X.shape[0]
        # integer row indices (repetitions allowed), not a boolean mask
        drawn_rows = self.random_generator.choice(n_rows, size=n_rows, replace=True, p=row_probabilities)

        if y is None:
            return X[drawn_rows], None
        return X[drawn_rows], y[drawn_rows]

__init__(random_generator=default_rng(seed=0), sampling_proba_func=lambda X: normalize_proba(X[:, 0]))

:param random_generator: A random generator :param sampling_proba_func: A function that takes as input data and returns a sampling probability

Source code in badgers/generators/tabular_data/imbalance.py
29
30
31
32
33
34
35
36
def __init__(self, random_generator=default_rng(seed=0), sampling_proba_func=lambda X: normalize_proba(X[:, 0])):
    """
    Configure how the sampling probabilities are derived from the features.

    :param random_generator: A random generator, forwarded to the parent constructor
    :param sampling_proba_func: A function that takes as input data and returns a sampling probability.
        By default, ``normalize_proba`` is applied to the first column of X.
    """
    super().__init__(random_generator=random_generator)
    self.sampling_proba_func = sampling_proba_func

generate(X, y=None, **params)

Randomly samples instances based on the features values in X

:param X: the input features
:param y: the targets (optional)
:return: Xt, yt

Source code in badgers/generators/tabular_data/imbalance.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
@numpy_API
def generate(self, X, y=None, **params):
    """
    Randomly samples instances based on the features values in X

    ``X.shape[0]`` row indices are drawn with replacement; the output of
    ``self.sampling_proba_func(X)`` is handed to ``choice`` as the probabilities ``p``.

    :param X: the input features
    :param y: the targets (optional); when given, the same row indices are applied to it
    :param params: additional keyword arguments (not used here)
    :return: Xt, yt (yt is None when y is None)
    """
    row_probabilities = self.sampling_proba_func(X)
    n_rows = X.shape[0]
    # integer row indices (repetitions allowed), not a boolean mask
    drawn_rows = self.random_generator.choice(n_rows, size=n_rows, replace=True, p=row_probabilities)

    if y is None:
        return X[drawn_rows], None
    return X[drawn_rows], y[drawn_rows]

RandomSamplingTargetsGenerator

Bases: ImbalanceGenerator

Randomly samples data points

Source code in badgers/generators/tabular_data/imbalance.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
class RandomSamplingTargetsGenerator(ImbalanceGenerator):
    """
    Randomly resamples data points with probabilities computed from the targets
    """

    def __init__(self, random_generator=default_rng(seed=0), sampling_proba_func=lambda y: normalize_proba(y)):
        """
        Configure how the sampling probabilities are derived from the targets.

        :param random_generator: A random generator, forwarded to the parent constructor
        :param sampling_proba_func: A function that takes y as input and returns a sampling probability.
            By default, ``normalize_proba`` is applied to y.
        """
        super().__init__(random_generator=random_generator)
        self.sampling_proba_func = sampling_proba_func
        self.transformed_labels_ = None

    @numpy_API
    def generate(self, X, y, **params):
        """
        Randomly samples instances based on the target values in y

        ``X.shape[0]`` row indices are drawn with replacement; the output of
        ``self.sampling_proba_func(y)`` is handed to ``choice`` as the probabilities ``p``.
        The same indices select the rows of X (indexed as ``X[indices, :]``) and the
        entries of y.

        :param X: the input features
        :param y: the targets
        :param params: additional keyword arguments (not used here)
        :return: Xt, yt
        """
        target_probabilities = self.sampling_proba_func(y)
        n_rows = X.shape[0]
        # integer row indices (repetitions allowed), not a boolean mask
        drawn_rows = self.random_generator.choice(n_rows, size=n_rows, replace=True, p=target_probabilities)

        return X[drawn_rows, :], y[drawn_rows]

__init__(random_generator=default_rng(seed=0), sampling_proba_func=lambda y: normalize_proba(y))

:param random_generator: A random generator :param sampling_proba_func: A function that takes y as input and returns a sampling probability

Source code in badgers/generators/tabular_data/imbalance.py
102
103
104
105
106
107
108
109
110
def __init__(self, random_generator=default_rng(seed=0), sampling_proba_func=lambda y: normalize_proba(y)):
    """
    Configure how the sampling probabilities are derived from the targets.

    :param random_generator: A random generator, forwarded to the parent constructor
    :param sampling_proba_func: A function that takes y as input and returns a sampling probability.
        By default, ``normalize_proba`` is applied to y.
    """
    super().__init__(random_generator=random_generator)
    # initialised to None here; no other assignment is visible in this listing
    self.transformed_labels_ = None
    self.sampling_proba_func = sampling_proba_func

generate(X, y, **params)

Randomly samples instances with probabilities computed from the targets y

:param X: the input features
:param y: the targets
:return: Xt, yt

Source code in badgers/generators/tabular_data/imbalance.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
@numpy_API
def generate(self, X, y, **params):
    """
    Randomly samples instances based on the target values in y

    ``X.shape[0]`` row indices are drawn with replacement; the output of
    ``self.sampling_proba_func(y)`` is handed to ``choice`` as the probabilities ``p``.
    The same indices select the rows of X (indexed as ``X[indices, :]``) and the
    entries of y.

    :param X: the input features
    :param y: the targets
    :param params: additional keyword arguments (not used here)
    :return: Xt, yt
    """
    target_probabilities = self.sampling_proba_func(y)
    n_rows = X.shape[0]
    # integer row indices (repetitions allowed), not a boolean mask
    drawn_rows = self.random_generator.choice(n_rows, size=n_rows, replace=True, p=target_probabilities)

    return X[drawn_rows, :], y[drawn_rows]