Skip to content

typos

LeetSpeakGenerator

Bases: TyposGenerator

Source code in badgers/generators/text/typos.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
class LeetSpeakGenerator(TyposGenerator):
    """
    Replace letters of words with "leet speak" look-alike strings.

    Each character whose upper-case form is a key of ``leet_speak_mapping`` may be
    replaced, with a given probability, by one of the alternatives listed for it.
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        Initialize the LeetSpeakGenerator with a given random number generator.

        NOTE: the default generator is built once, when this function is defined,
        so instances created without an explicit ``random_generator`` receive the
        same generator object.

        :param random_generator: A random number generator used to introduce randomness in leetspeak transformation.
        :type random_generator: numpy.random.Generator, default=default_rng(seed=0)
        """
        super().__init__(random_generator=random_generator)
        # Upper-case letter -> candidate leet replacements (single or multi-character).
        # The order (and any repetition) of the candidates matters for seeded
        # reproducibility, because a candidate is drawn with ``random_generator.choice``.
        self.leet_speak_mapping = {
            "A": ["4", "/\\", "@", "/-\\", "^", "(L", "\u0414"],
            "B": ["I3", "8", "13", "|3", "\u00df", "!3", "(3", "/3", ")3", "|-]", "j3"],
            "C": ["[", "\u00a2", "<", "(", "\u00a9"],
            "D": [")", "|)", "(|", "[)", "I>", "|>", "?", "T)", "I7", "cl", "|}", "|]"],
            "E": ["3", "&", "\u00a3", "\u20ac", "[-", "|=-"],
            "F": ["|=", "\u0192", "|#", "ph", "/=", "v"],
            "G": ["6", "&", "(_+", "9", "C-", "gee", "(?,", "[,", "{,", "<-", "(."],
            "H": ["#", "/-/", "\\-\\", "[-]", "]-[", ")-(", "(-)", ":-:", "|~|", "|-|", "]~[", "}{", "!-!", "1-1",
                  "\\-/", "I+I", "?"],
            "I": ["1", "|", "][", "!", "eye", "3y3"],
            "J": [",_|", "_|", "._|", "._]", "_]", ",_]", "]"],
            "K": [">|", "|<", "1<", "|c", "|(7<"],
            "L": ["1", "2", "\u00a3", "7", "|_", "|"],
            "M": ["/\\/\\", "/V\\", "[V]", "|\\/|", "^^", "<\\/>", "{V}", "(v)", "(V)", "|\\|\\", "]\\/[", "nn", "11"],
            "N": ["^/", "|\\|", "/\\/", "[\\]", "<\\>", "{\\}", "/V", "^", "\u0e17", "\u0418"],
            "O": ["0", "()", "oh", "[]", "p", "<>", "\u00d8"],
            "P": ["|*", "|o", "|\u00ba", "?", "|^", "|>", "|\"", "9", "[]D", "|\u00b0", "|7"],
            "Q": ["(_,)", "()_", "2", "0_", "<|", "&", "9", "\u00b6", "\u204b", "\u2117"],
            "R": ["I2", "9", "|`", "|~", "|?", "/2", "|^", "lz", "7", "2", "12", "\u00ae", "[z", "\u042f", ".-", "|2",
                  "|-", "3"],
            "S": ["5", "$", "z", "\u00a7", "ehs", "es", "2"],
            "T": ["7", "+", "-|-", "']['", "\u2020", "\u00ab|\u00bb", "~|~"],
            "U": ["(_)", "|_|", "v", "L|", "\u0e1a"],
            "V": ["\\/", "|/", "\\|"],
            "W": ["\\/\\/", "vv", "\\N", "'//", "\\\\'", "\\^/", "\\/\\/", "(n)", "\\V/", "\\X/", "\\|/", "\\_|_/",
                  "\\_:_/", "uu", "2u", "\\\\//\\\\//", "\u0e1e", "\u20a9"],
            "X": ["><", "}{", "ecks", "\u00d7", "?", "}{", ")(", "]["],
            "Y": ["j", "`/", "\\|/", "\u00a5", "\\//"],
            "Z": ["2", "7_", "-/_", "%", ">_", "s", "~/_", "-\\_", "-|_"]
        }

    def randomly_replace_letter(self, letter, replacement_proba):
        """
        Randomly replace a letter with its leet counterpart based on the provided probability.

        Characters whose upper-case form is not a key of ``leet_speak_mapping`` are
        returned unchanged and do not consume a random draw.

        :param letter: The letter to potentially replace.
        :type letter: str
        :param replacement_proba: The probability of replacing the letter with its leet counterpart.
        :type replacement_proba: float
        :return: A leet replacement if a random draw is strictly less than replacement_proba, otherwise the original letter.
        :rtype: str
        """
        if letter.upper() in self.leet_speak_mapping:
            if self.random_generator.random() < replacement_proba:
                letter = self.random_generator.choice(self.leet_speak_mapping[letter.upper()])

        return letter

    def generate(self, X, y, replacement_proba: float = 0.1) -> Tuple:
        """
        Apply leet speak transformation to a list of words.

        :param X: A list of words where leet speak transformation is applied.
        :param y: The labels associated with the words, which remain unchanged.
        :param replacement_proba: The probability of replacing a letter with its leet counterpart.
                                  This probability applies to each letter in each word independently.
                                  Must lie in [0, 1].
        :return: A tuple containing the transformed list of words and the original labels `y` (unchanged).
        :raises AssertionError: If `replacement_proba` is outside [0, 1].
        """
        # Validate up front: this check used to sit after the return statement
        # and was therefore never executed.
        assert 0 <= replacement_proba <= 1
        Xt = [
            ''.join([self.randomly_replace_letter(letter, replacement_proba=replacement_proba) for letter in word])
            for word in X
        ]

        return Xt, y

__init__(random_generator=default_rng(seed=0))

Initialize the LeetSpeakGenerator with a given random number generator.

Parameters:

Name Type Description Default
random_generator numpy.random.Generator, default=default_rng(seed=0)

A random number generator used to introduce randomness in leetspeak transformation.

default_rng(seed=0)
Source code in badgers/generators/text/typos.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def __init__(self, random_generator=default_rng(seed=0)):
    """
    Initialize the LeetSpeakGenerator with a given random number generator.

    The generator is forwarded to the parent initializer, then the table of
    leet replacements is stored in ``self.leet_speak_mapping``.

    NOTE: the default generator is built once, when this function is defined,
    so every call that omits ``random_generator`` receives the same object.

    :param random_generator: A random number generator used to introduce randomness in leetspeak transformation.
    :type random_generator: numpy.random.Generator, default=default_rng(seed=0)
    """
    super().__init__(random_generator=random_generator)
    # Upper-case letter -> list of candidate replacement strings
    # (candidates may be longer than one character).
    # NOTE(review): a few candidates are listed twice for the same letter
    # (e.g. "}{" under "X"); presumably unintentional - confirm before de-duplicating,
    # since changing list lengths may change seeded outputs.
    self.leet_speak_mapping = {
        "A": ["4", "/\\", "@", "/-\\", "^", "(L", "\u0414"],
        "B": ["I3", "8", "13", "|3", "\u00df", "!3", "(3", "/3", ")3", "|-]", "j3"],
        "C": ["[", "\u00a2", "<", "(", "\u00a9"],
        "D": [")", "|)", "(|", "[)", "I>", "|>", "?", "T)", "I7", "cl", "|}", "|]"],
        "E": ["3", "&", "\u00a3", "\u20ac", "[-", "|=-"],
        "F": ["|=", "\u0192", "|#", "ph", "/=", "v"],
        "G": ["6", "&", "(_+", "9", "C-", "gee", "(?,", "[,", "{,", "<-", "(."],
        "H": ["#", "/-/", "\\-\\", "[-]", "]-[", ")-(", "(-)", ":-:", "|~|", "|-|", "]~[", "}{", "!-!", "1-1",
              "\\-/", "I+I", "?"],
        "I": ["1", "|", "][", "!", "eye", "3y3"],
        "J": [",_|", "_|", "._|", "._]", "_]", ",_]", "]"],
        "K": [">|", "|<", "1<", "|c", "|(7<"],
        "L": ["1", "2", "\u00a3", "7", "|_", "|"],
        "M": ["/\\/\\", "/V\\", "[V]", "|\\/|", "^^", "<\\/>", "{V}", "(v)", "(V)", "|\\|\\", "]\\/[", "nn", "11"],
        "N": ["^/", "|\\|", "/\\/", "[\\]", "<\\>", "{\\}", "/V", "^", "\u0e17", "\u0418"],
        "O": ["0", "()", "oh", "[]", "p", "<>", "\u00d8"],
        "P": ["|*", "|o", "|\u00ba", "?", "|^", "|>", "|\"", "9", "[]D", "|\u00b0", "|7"],
        "Q": ["(_,)", "()_", "2", "0_", "<|", "&", "9", "\u00b6", "\u204b", "\u2117"],
        "R": ["I2", "9", "|`", "|~", "|?", "/2", "|^", "lz", "7", "2", "12", "\u00ae", "[z", "\u042f", ".-", "|2",
              "|-", "3"],
        "S": ["5", "$", "z", "\u00a7", "ehs", "es", "2"],
        "T": ["7", "+", "-|-", "']['", "\u2020", "\u00ab|\u00bb", "~|~"],
        "U": ["(_)", "|_|", "v", "L|", "\u0e1a"],
        "V": ["\\/", "|/", "\\|"],
        "W": ["\\/\\/", "vv", "\\N", "'//", "\\\\'", "\\^/", "\\/\\/", "(n)", "\\V/", "\\X/", "\\|/", "\\_|_/",
              "\\_:_/", "uu", "2u", "\\\\//\\\\//", "\u0e1e", "\u20a9"],
        "X": ["><", "}{", "ecks", "\u00d7", "?", "}{", ")(", "]["],
        "Y": ["j", "`/", "\\|/", "\u00a5", "\\//"],
        "Z": ["2", "7_", "-/_", "%", ">_", "s", "~/_", "-\\_", "-|_"]
    }

generate(X, y, replacement_proba=0.1)

Apply leet speak transformation to a list of words.

Parameters:

Name Type Description Default
X

A list of words where leet speak transformation is applied.

required
y

The labels associated with the words, which remain unchanged.

required
replacement_proba float

The probability of replacing a letter with its leet counterpart. This probability applies to each letter in each word independently.

0.1

Returns:

Type Description
Tuple

A tuple containing the transformed list of words and the original labels y (unchanged).

Source code in badgers/generators/text/typos.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def generate(self, X, y, replacement_proba: float = 0.1) -> Tuple:
    """
    Apply leet speak transformation to a list of words.

    :param X: A list of words where leet speak transformation is applied.
    :param y: The labels associated with the words, which remain unchanged.
    :param replacement_proba: The probability of replacing a letter with its leet counterpart.
                              This probability applies to each letter in each word independently.
                              Must lie in [0, 1].
    :return: A tuple containing the transformed list of words and the original labels `y` (unchanged).
    :raises AssertionError: If `replacement_proba` is outside [0, 1].
    """
    # Validate up front: this check used to sit after the return statement
    # and was therefore never executed.
    assert 0 <= replacement_proba <= 1
    Xt = [
        ''.join([self.randomly_replace_letter(letter, replacement_proba=replacement_proba) for letter in word])
        for word in X
    ]

    return Xt, y

randomly_replace_letter(letter, replacement_proba)

Randomly replace a letter with its leet counterpart based on the provided probability.

Parameters:

Name Type Description Default
letter str

The letter to potentially replace.

required
replacement_proba float

The probability of replacing the letter with its leet counterpart.

required

Returns:

Type Description
str

A leet replacement for the letter if a random draw is strictly less than replacement_proba, otherwise the original letter. Characters without a leet mapping are always returned unchanged.

Source code in badgers/generators/text/typos.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def randomly_replace_letter(self, letter, replacement_proba):
    """
    Possibly substitute a character with one of its leet speak alternatives.

    A character whose upper-case form is not a key of ``self.leet_speak_mapping``
    is handed back as is, without consuming a random draw.

    :param letter: The character that may be substituted.
    :type letter: str
    :param replacement_proba: The probability of substituting the character.
    :type replacement_proba: float
    :return: One of the mapped alternatives when the random draw is strictly below
             replacement_proba, otherwise the character that was passed in.
    :rtype: str
    """
    lookup_key = letter.upper()
    if lookup_key not in self.leet_speak_mapping:
        return letter

    draw = self.random_generator.random()
    if draw < replacement_proba:
        return self.random_generator.choice(self.leet_speak_mapping[lookup_key])

    return letter

SwapCaseGenerator

Bases: TyposGenerator

Source code in badgers/generators/text/typos.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
class SwapCaseGenerator(TyposGenerator):
    """
    Flip the case of individual characters in words at random.
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        Set up the generator with the source of randomness to use.

        :param random_generator: Random number generator driving the case swapping decisions.
        :type random_generator: numpy.random.Generator, default=default_rng(seed=0)
        """
        super().__init__(random_generator)

    def randomly_swapcase_letter(self, letter, swapcase_proba):
        """
        Possibly flip the case of a single character.

        One random draw is consumed per call.

        :param letter: The character whose case may be flipped.
        :type letter: str
        :param swapcase_proba: The probability of flipping the case.
        :type swapcase_proba: float
        :return: ``letter.swapcase()`` when the random draw is strictly below swapcase_proba,
                 otherwise the character as passed in.
        :rtype: str
        """
        draw = self.random_generator.random()
        return letter.swapcase() if draw < swapcase_proba else letter

    def generate(self, X, y, swapcase_proba: float = 0.1) -> Tuple:
        """
        Flip the case of characters at random in every word of a list.

        :param X: The words to transform.
        :param y: The labels belonging to the words; passed through untouched.
        :param swapcase_proba: Per-character probability of flipping the case, applied
                               independently to every character of every word. Must lie in [0, 1].
        :return: A tuple made of the new list of words and `y` as received.
        """
        assert 0 <= swapcase_proba <= 1

        swapped_words = []
        for word in X:
            characters = []
            for character in word:
                characters.append(self.randomly_swapcase_letter(character, swapcase_proba=swapcase_proba))
            swapped_words.append(''.join(characters))

        return swapped_words, y

__init__(random_generator=default_rng(seed=0))

Initialize the SwapCaseGenerator with a given random number generator.

Parameters:

Name Type Description Default
random_generator numpy.random.Generator, default=default_rng(seed=0)

A random number generator used to introduce randomness in case swapping.

default_rng(seed=0)
Source code in badgers/generators/text/typos.py
154
155
156
157
158
159
160
161
def __init__(self, random_generator=default_rng(seed=0)):
    """
    Initialize the SwapCaseGenerator with a given random number generator.

    The generator is simply forwarded to the parent initializer.

    NOTE: the default generator is built once, when this function is defined,
    so every call that omits ``random_generator`` receives the same object.

    :param random_generator: A random number generator used to introduce randomness in case swapping.
    :type random_generator: numpy.random.Generator, default=default_rng(seed=0)
    """
    super().__init__(random_generator)

generate(X, y, swapcase_proba=0.1)

Apply random case swapping to each letter in a list of words.

Parameters:

Name Type Description Default
X

A list of words where random case swapping is applied.

required
y

The labels associated with the words, which remain unchanged.

required
swapcase_proba float

The probability of swapping the case of each letter. This probability applies to each letter in each word independently.

0.1

Returns:

Type Description
Tuple

A tuple containing the transformed list of words and the original labels y (unchanged).

Source code in badgers/generators/text/typos.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
def generate(self, X, y, swapcase_proba: float = 0.1) -> Tuple:
    """
    Flip the case of characters at random in every word of a list.

    :param X: The words to transform.
    :param y: The labels belonging to the words; passed through untouched.
    :param swapcase_proba: Per-character probability of flipping the case, applied
                           independently to every character of every word. Must lie in [0, 1].
    :return: A tuple made of the new list of words and `y` as received.
    """
    assert 0 <= swapcase_proba <= 1

    swapped_words = []
    for word in X:
        characters = []
        for character in word:
            characters.append(self.randomly_swapcase_letter(character, swapcase_proba=swapcase_proba))
        swapped_words.append(''.join(characters))

    return swapped_words, y

randomly_swapcase_letter(letter, swapcase_proba)

Randomly swap the case of a letter based on the provided probability.

Parameters:

Name Type Description Default
letter str

The letter whose case may be swapped.

required
swapcase_proba float

The probability of swapping the case of the letter.

required

Returns:

Type Description
str

The letter with swapped case if a random draw is strictly less than swapcase_proba, otherwise the original letter.

Source code in badgers/generators/text/typos.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def randomly_swapcase_letter(self, letter, swapcase_proba):
    """
    Possibly flip the case of a single character.

    One random draw is consumed per call.

    :param letter: The character whose case may be flipped.
    :type letter: str
    :param swapcase_proba: The probability of flipping the case.
    :type swapcase_proba: float
    :return: ``letter.swapcase()`` when the random draw is strictly below swapcase_proba,
             otherwise the character as passed in.
    :rtype: str
    """
    draw = self.random_generator.random()
    return letter.swapcase() if draw < swapcase_proba else letter

SwapLettersGenerator

Bases: TyposGenerator

Swap adjacent letters in words randomly except for the first and the last letters. Example: 'kilogram' --> 'kilogarm'

Source code in badgers/generators/text/typos.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
class SwapLettersGenerator(TyposGenerator):
    """
    Swap adjacent letters in words randomly except for the first and the last letters.
    Example: 'kilogram' --> 'kilogarm'
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        Initialize the SwapLettersGenerator with a given random number generator.

        :param random_generator: A random number generator used to introduce randomness in letter swapping.
        :type random_generator: numpy.random.Generator, default=default_rng(seed=0)
        """
        super().__init__(random_generator)

    def generate(self, X, y, swap_proba: float = 0.1) -> Tuple:
        """
        For each word with a length greater than 3, apply a single swap with probability `swap_proba`.
        The position of the swap is chosen randomly among possible adjacent pairs of letters,
        excluding the first and last letters of the word.

        The input `X` is not modified; a new list is returned.

        :param X: A list of words where typos are introduced.
        :param y: The labels associated with the words, which remain unchanged.
        :param swap_proba: Probability that a word with more than 3 characters will have one adjacent pair of letters swapped.
                           This probability applies to each eligible word independently. Must lie in [0, 1].
        :return: A tuple containing the transformed list of words and the original labels `y` (unchanged).
        :raises AssertionError: If `swap_proba` is outside [0, 1].
        """
        assert 0 <= swap_proba <= 1
        Xt = []
        for word in X:
            # The random draw happens only for eligible words (short-circuit `and`).
            # The strict `<` guarantees that swap_proba == 0 never swaps.
            if len(word) > 3 and self.random_generator.random() < swap_proba:
                letters = list(word)
                # `integers` excludes its upper bound, so idx is in [1, len - 3] and
                # the swapped pair (idx, idx + 1) never touches the first or last letter.
                idx = self.random_generator.integers(1, len(letters) - 2)
                letters[idx], letters[idx + 1] = letters[idx + 1], letters[idx]
                word = ''.join(letters)
            Xt.append(word)

        return Xt, y

__init__(random_generator=default_rng(seed=0))

Initialize the SwapLettersGenerator with a given random number generator.

Parameters:

Name Type Description Default
random_generator numpy.random.Generator, default=default_rng(seed=0)

A random number generator used to introduce randomness in letter swapping.

default_rng(seed=0)
Source code in badgers/generators/text/typos.py
34
35
36
37
38
39
40
41
def __init__(self, random_generator=default_rng(seed=0)):
    """
    Initialize the SwapLettersGenerator with a given random number generator.

    The generator is simply forwarded to the parent initializer.

    NOTE: the default generator is built once, when this function is defined,
    so every call that omits ``random_generator`` receives the same object.

    :param random_generator: A random number generator used to introduce randomness in letter swapping.
    :type random_generator: numpy.random.Generator, default=default_rng(seed=0)
    """
    super().__init__(random_generator)

generate(X, y, swap_proba=0.1)

For each word with a length greater than 3, apply a single swap with probability swap_proba. The position of the swap is chosen randomly among possible adjacent pairs of letters, excluding the first and last letters of the word.

Parameters:

Name Type Description Default
X

A list of words where typos are introduced.

required
y

The labels associated with the words; not used by the transformation and returned unchanged.

required
swap_proba float

Probability that a word with more than 3 characters will have one adjacent pair of letters swapped. This probability applies to each eligible word independently.

0.1

Returns:

Type Description
Tuple

A tuple containing the transformed list of words and the original labels y (unchanged).

Source code in badgers/generators/text/typos.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def generate(self, X, y, swap_proba: float = 0.1) -> Tuple:
    """
    For each word with a length greater than 3, apply a single swap with probability `swap_proba`.
    The position of the swap is chosen randomly among possible adjacent pairs of letters,
    excluding the first and last letters of the word.

    The input `X` is not modified; a new list is returned.

    :param X: A list of words where typos are introduced.
    :param y: The labels associated with the words, which remain unchanged.
    :param swap_proba: Probability that a word with more than 3 characters will have one adjacent pair of letters swapped.
                       This probability applies to each eligible word independently. Must lie in [0, 1].
    :return: A tuple containing the transformed list of words and the original labels `y` (unchanged).
    :raises AssertionError: If `swap_proba` is outside [0, 1].
    """
    assert 0 <= swap_proba <= 1
    Xt = []
    for word in X:
        # The random draw happens only for eligible words (short-circuit `and`).
        # The strict `<` guarantees that swap_proba == 0 never swaps.
        if len(word) > 3 and self.random_generator.random() < swap_proba:
            letters = list(word)
            # `integers` excludes its upper bound, so idx is in [1, len - 3] and
            # the swapped pair (idx, idx + 1) never touches the first or last letter.
            idx = self.random_generator.integers(1, len(letters) - 2)
            letters[idx], letters[idx + 1] = letters[idx + 1], letters[idx]
            word = ''.join(letters)
        Xt.append(word)

    return Xt, y

TyposGenerator

Bases: GeneratorMixin

Base class for transformers creating typos in a list of words

Source code in badgers/generators/text/typos.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
class TyposGenerator(GeneratorMixin):
    """
    Common parent of the generators that introduce typos into a list of words.

    It only stores the random number generator; the transformation itself is
    supplied by subclasses through ``generate``.
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        Keep a reference to the random number generator on the instance.

        NOTE: the default generator is built once, when this function is defined,
        so instances created without an explicit ``random_generator`` share the
        same generator object.

        :param random_generator: Random number generator that subclasses draw from when creating typos.
        :type random_generator: numpy.random.Generator, default=default_rng(seed=0)
        """
        self.random_generator = random_generator

    @abc.abstractmethod
    def generate(self, X, y, **params) -> Tuple:
        """
        Transform the input; meant to be overridden by subclasses.

        NOTE(review): whether instantiating a subclass without this method is
        actually rejected depends on the metaclass of GeneratorMixin - confirm.

        :param X: The input to transform.
        :param y: The labels accompanying the input.
        :param params: Generator-specific keyword parameters.
        :return: Annotated as a tuple; this base implementation itself returns None.
        """

__init__(random_generator=default_rng(seed=0))

Initialize the TyposGenerator with a given random number generator.

Parameters:

Name Type Description Default
random_generator numpy.random.Generator, default=default_rng(seed=0)

A random number generator used to introduce randomness in typo generation.

default_rng(seed=0)
Source code in badgers/generators/text/typos.py
14
15
16
17
18
19
20
21
def __init__(self, random_generator=default_rng(seed=0)):
    """
    Initialize the TyposGenerator with a given random number generator.

    The generator is stored on the instance as ``self.random_generator``.

    NOTE: the default generator is built once, when this function is defined,
    so every call that omits ``random_generator`` stores the same object.

    :param random_generator: A random number generator used to introduce randomness in typo generation.
    :type random_generator: numpy.random.Generator, default=default_rng(seed=0)
    """
    self.random_generator = random_generator