Skip to content

typos

LeetSpeakGenerator

Bases: TyposGenerator

Source code in badgers/generators/text/typos.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
class LeetSpeakGenerator(TyposGenerator):
    """
    Replace letters in words with one of their "leet speak" counterparts, chosen at random
    (e.g. an 'E' may become '3'). Characters that have no leet counterpart are left untouched.
    """

    def __init__(self, random_generator=default_rng(seed=0), replacement_proba: float = 0.1):
        """

        :param random_generator: a random number generator
            (the default object is created once, when the class body is executed, so every instance
            that does not receive its own generator shares the same one)
        :param replacement_proba: the probability of replacing a letter with its leet counterpart,
            must lie in [0, 1]
        """
        super().__init__(random_generator=random_generator)
        assert 0 <= replacement_proba <= 1
        self.replacement_proba = replacement_proba
        # upper-case letter --> list of candidate replacements
        self.leet_speak_mapping = {
            "A": ["4", "/\\", "@", "/-\\", "^", "(L", "\u0414"],
            "B": ["I3", "8", "13", "|3", "\u00df", "!3", "(3", "/3", ")3", "|-]", "j3"],
            "C": ["[", "\u00a2", "<", "(", "\u00a9"],
            "D": [")", "|)", "(|", "[)", "I>", "|>", "?", "T)", "I7", "cl", "|}", "|]"],
            "E": ["3", "&", "\u00a3", "\u20ac", "[-", "|=-"],
            "F": ["|=", "\u0192", "|#", "ph", "/=", "v"],
            "G": ["6", "&", "(_+", "9", "C-", "gee", "(?,", "[,", "{,", "<-", "(."],
            "H": ["#", "/-/", "\\-\\", "[-]", "]-[", ")-(", "(-)", ":-:", "|~|", "|-|", "]~[", "}{", "!-!", "1-1",
                  "\\-/", "I+I", "?"],
            "I": ["1", "|", "][", "!", "eye", "3y3"],
            "J": [",_|", "_|", "._|", "._]", "_]", ",_]", "]"],
            "K": [">|", "|<", "1<", "|c", "|(7<"],
            "L": ["1", "2", "\u00a3", "7", "|_", "|"],
            "M": ["/\\/\\", "/V\\", "[V]", "|\\/|", "^^", "<\\/>", "{V}", "(v)", "(V)", "|\\|\\", "]\\/[", "nn", "11"],
            "N": ["^/", "|\\|", "/\\/", "[\\]", "<\\>", "{\\}", "/V", "^", "\u0e17", "\u0418"],
            "O": ["0", "()", "oh", "[]", "p", "<>", "\u00d8"],
            "P": ["|*", "|o", "|\u00ba", "?", "|^", "|>", "|\"", "9", "[]D", "|\u00b0", "|7"],
            "Q": ["(_,)", "()_", "2", "0_", "<|", "&", "9", "\u00b6", "\u204b", "\u2117"],
            "R": ["I2", "9", "|`", "|~", "|?", "/2", "|^", "lz", "7", "2", "12", "\u00ae", "[z", "\u042f", ".-", "|2",
                  "|-", "3"],
            "S": ["5", "$", "z", "\u00a7", "ehs", "es", "2"],
            "T": ["7", "+", "-|-", "']['", "\u2020", "\u00ab|\u00bb", "~|~"],
            "U": ["(_)", "|_|", "v", "L|", "\u0e1a"],
            "V": ["\\/", "|/", "\\|"],
            "W": ["\\/\\/", "vv", "\\N", "'//", "\\\\'", "\\^/", "\\/\\/", "(n)", "\\V/", "\\X/", "\\|/", "\\_|_/",
                  "\\_:_/", "uu", "2u", "\\\\//\\\\//", "\u0e1e", "\u20a9"],
            "X": ["><", "}{", "ecks", "\u00d7", "?", "}{", ")(", "]["],
            "Y": ["j", "`/", "\\|/", "\u00a5", "\\//"],
            "Z": ["2", "7_", "-/_", "%", ">_", "s", "~/_", "-\\_", "-|_"]
        }

    def randomly_replace_letter(self, letter):
        """
        Randomly replace a letter with its leet counterpart

        Characters without an entry in `self.leet_speak_mapping` (digits, spaces, punctuation, ...)
        are returned unchanged instead of raising a KeyError.

        :param letter: a single character
        :return: with probability `self.replacement_proba` one of the letter's leet counterparts
            (picked by `self.random_generator`), otherwise the letter itself
        """
        if self.random_generator.random() < self.replacement_proba:
            # the mapping is keyed by upper-case letters only
            candidates = self.leet_speak_mapping.get(letter.upper())
            if candidates:
                return self.random_generator.choice(candidates)
        return letter

    def generate(self, X, y, **params) -> Tuple:
        """
        Pass every character of every word through `randomly_replace_letter`

        :param X: A list of words where we apply typos (it is not modified)
        :param y: returned as is
        :return: a new list with the transformed words, and y
        """
        Xt = [
            ''.join([self.randomly_replace_letter(letter) for letter in word])
            for word in X
        ]

        return Xt, y

__init__(random_generator=default_rng(seed=0), replacement_proba=0.1)

:param random_generator: a random number generator
:param replacement_proba: the probability of replacing a letter with its leet counterpart

Source code in badgers/generators/text/typos.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def __init__(self, random_generator=default_rng(seed=0), replacement_proba: float = 0.1):
    """
    Set up the generator and its table of leet replacements

    :param random_generator: a random number generator
    :param replacement_proba: the probability of replacing a letter with its leet counterpart
        (asserted to lie in [0, 1])
    """
    super().__init__(random_generator=random_generator)
    assert 0 <= replacement_proba <= 1
    # candidate replacements, keyed by upper-case letter
    leet_table = {
        "A": ["4", "/\\", "@", "/-\\", "^", "(L", "\u0414"],
        "B": ["I3", "8", "13", "|3", "\u00df", "!3", "(3", "/3", ")3", "|-]", "j3"],
        "C": ["[", "\u00a2", "<", "(", "\u00a9"],
        "D": [")", "|)", "(|", "[)", "I>", "|>", "?", "T)", "I7", "cl", "|}", "|]"],
        "E": ["3", "&", "\u00a3", "\u20ac", "[-", "|=-"],
        "F": ["|=", "\u0192", "|#", "ph", "/=", "v"],
        "G": ["6", "&", "(_+", "9", "C-", "gee", "(?,", "[,", "{,", "<-", "(."],
        "H": ["#", "/-/", "\\-\\", "[-]", "]-[", ")-(", "(-)", ":-:", "|~|", "|-|", "]~[", "}{", "!-!", "1-1",
              "\\-/", "I+I", "?"],
        "I": ["1", "|", "][", "!", "eye", "3y3"],
        "J": [",_|", "_|", "._|", "._]", "_]", ",_]", "]"],
        "K": [">|", "|<", "1<", "|c", "|(7<"],
        "L": ["1", "2", "\u00a3", "7", "|_", "|"],
        "M": ["/\\/\\", "/V\\", "[V]", "|\\/|", "^^", "<\\/>", "{V}", "(v)", "(V)", "|\\|\\", "]\\/[", "nn", "11"],
        "N": ["^/", "|\\|", "/\\/", "[\\]", "<\\>", "{\\}", "/V", "^", "\u0e17", "\u0418"],
        "O": ["0", "()", "oh", "[]", "p", "<>", "\u00d8"],
        "P": ["|*", "|o", "|\u00ba", "?", "|^", "|>", "|\"", "9", "[]D", "|\u00b0", "|7"],
        "Q": ["(_,)", "()_", "2", "0_", "<|", "&", "9", "\u00b6", "\u204b", "\u2117"],
        "R": ["I2", "9", "|`", "|~", "|?", "/2", "|^", "lz", "7", "2", "12", "\u00ae", "[z", "\u042f", ".-", "|2",
              "|-", "3"],
        "S": ["5", "$", "z", "\u00a7", "ehs", "es", "2"],
        "T": ["7", "+", "-|-", "']['", "\u2020", "\u00ab|\u00bb", "~|~"],
        "U": ["(_)", "|_|", "v", "L|", "\u0e1a"],
        "V": ["\\/", "|/", "\\|"],
        "W": ["\\/\\/", "vv", "\\N", "'//", "\\\\'", "\\^/", "\\/\\/", "(n)", "\\V/", "\\X/", "\\|/", "\\_|_/",
              "\\_:_/", "uu", "2u", "\\\\//\\\\//", "\u0e1e", "\u20a9"],
        "X": ["><", "}{", "ecks", "\u00d7", "?", "}{", ")(", "]["],
        "Y": ["j", "`/", "\\|/", "\u00a5", "\\//"],
        "Z": ["2", "7_", "-/_", "%", ">_", "s", "~/_", "-\\_", "-|_"]
    }
    self.replacement_proba = replacement_proba
    self.leet_speak_mapping = leet_table

randomly_replace_letter(letter)

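Randomly replace a letter with its leet counterpart.

:param letter: the character to (possibly) replace
:return: one of the letter's leet counterparts, or the letter itself when no replacement is drawn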

Source code in badgers/generators/text/typos.py
107
108
109
110
111
112
113
114
115
116
def randomly_replace_letter(self, letter):
    """
    Randomly replace a letter with its leet counterpart

    Characters without an entry in `self.leet_speak_mapping` (digits, spaces, punctuation, ...)
    are returned unchanged instead of raising a KeyError.

    :param letter: a single character
    :return: with probability `self.replacement_proba` one of the letter's leet counterparts
        (picked by `self.random_generator`), otherwise the letter itself
    """
    if self.random_generator.random() < self.replacement_proba:
        # the mapping is keyed by upper-case letters only
        candidates = self.leet_speak_mapping.get(letter.upper())
        if candidates:
            return self.random_generator.choice(candidates)
    return letter

SwapLettersGenerator

Bases: TyposGenerator

Swap adjacent letters in words randomly except for the first and the last letters. Example: 'kilogram' --> 'kilogarm'

Source code in badgers/generators/text/typos.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
class SwapLettersGenerator(TyposGenerator):
    """
    Swap adjacent letters in words randomly except for the first and the last letters.
    Example: 'kilogram' --> 'kilogarm'
    """

    def __init__(self, random_generator=default_rng(seed=0), swap_proba=0.1):
        """

        :param random_generator: A random generator
        :param swap_proba: Each word with a length greater than 3 will have this probability to contain a switch (max one per word).
            It is stored as `self.switching_proba`

        """
        super().__init__(random_generator)
        self.switching_proba = swap_proba

    def generate(self, X, y, **params) -> Tuple:
        """
        For each word with a length greater than 3, apply a single swap with probability `self.switching_proba`
        Where the swap happens is determined randomly

        The words are copied into a new list first, so the list passed by the caller is left untouched.

        :param X: A list of words where we apply typos
        :param y: returned as is
        :return: the transformed list of words, and y
        """
        Xt = list(X)
        for i, word in enumerate(Xt):
            # short words are skipped before drawing, so they consume no random number
            if len(word) > 3 and self.random_generator.random() < self.switching_proba:
                letters = list(word)
                # `integers` excludes the upper bound: idx is in [1, len - 3], hence idx + 1 <= len - 2
                # and neither the first nor the last letter is ever moved
                idx = self.random_generator.integers(1, len(letters) - 2)
                letters[idx], letters[idx + 1] = letters[idx + 1], letters[idx]
                Xt[i] = ''.join(letters)

        return Xt, y

__init__(random_generator=default_rng(seed=0), swap_proba=0.1)

:param random_generator: A random generator
:param swap_proba: Each word with a length greater than 3 will have this probability to contain a switch (max one per word)

Source code in badgers/generators/text/typos.py
33
34
35
36
37
38
39
40
41
def __init__(self, random_generator=default_rng(seed=0), swap_proba=0.1):
    """
    Store the random generator and the per-word swap probability

    :param random_generator: A random generator
    :param swap_proba: Each word with a length greater than 3 will have this probability to contain a switch (max one per word)
    """
    super().__init__(random_generator=random_generator)
    # the probability is kept under the attribute name `switching_proba`
    self.switching_proba = swap_proba

generate(X, y, **params)

For each word with a length greater than 3, apply a single swap with probability self.switching_proba. Where the swap happens is determined randomly.

:param X: A list of words where we apply typos
:return: the transformed list of words

Source code in badgers/generators/text/typos.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def generate(self, X, y, **params) -> Tuple:
    """
    For each word with a length greater than 3, apply a single swap with probability `self.switching_proba`
    Where the swap happens is determined randomly

    The words are copied into a new list first, so the list passed by the caller is left untouched.

    :param X: A list of words where we apply typos
    :param y: returned as is
    :return: the transformed list of words, and y
    """
    Xt = list(X)
    for i, word in enumerate(Xt):
        # short words are skipped before drawing, so they consume no random number
        if len(word) > 3 and self.random_generator.random() < self.switching_proba:
            letters = list(word)
            # `integers` excludes the upper bound: idx is in [1, len - 3], hence idx + 1 <= len - 2
            # and neither the first nor the last letter is ever moved
            idx = self.random_generator.integers(1, len(letters) - 2)
            letters[idx], letters[idx + 1] = letters[idx + 1], letters[idx]
            Xt[i] = ''.join(letters)

    return Xt, y

TyposGenerator

Bases: GeneratorMixin

Base class for transformers creating typos in a list of words

Source code in badgers/generators/text/typos.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
class TyposGenerator(GeneratorMixin):
    """
    Common parent of the generators that introduce typos into a list of words
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """

        :param random_generator: numpy.random.Generator, default default_rng(seed=0)
            The source of randomness, kept as `self.random_generator`
        """
        self.random_generator = random_generator

    @abc.abstractmethod
    def generate(self, X, y, **params) -> Tuple:
        """
        Hook to be overridden by the concrete typo generators; this base version does nothing.

        NOTE(review): `abc.abstractmethod` only prevents instantiation when the class uses an
        ABCMeta metaclass, which depends on GeneratorMixin (not visible here) -- to be confirmed.

        :param X: A list of words where we apply typos
        :param y: not used by this base version
        """

__init__(random_generator=default_rng(seed=0))

:param random_generator: numpy.random.Generator, default default_rng(seed=0)
    A random generator

Source code in badgers/generators/text/typos.py
14
15
16
17
18
19
20
def __init__(self, random_generator=default_rng(seed=0)):
    """
    Store the random generator on the instance

    :param random_generator: numpy.random.Generator, default default_rng(seed=0)
        A random generator. The default object is created once, when this function is defined,
        so every caller that omits the argument shares the same generator
    """
    self.random_generator = random_generator