Source code for charex.denormal

"""
denormal
~~~~~~~~

Functions for reversing the normalization of strings.
"""
from collections.abc import Generator, Sequence
from itertools import product
from math import prod
from random import choice, seed

from charex.charex import Character


# Functions.
[docs] def count_denormalizations( base: str, form: str, maxdepth: int | None = None ) -> int: """Determine the number of denormalizations that exist for the string. :param base: The :class:`str` to denormalize. :param form: The Unicode normalization form to denormalize from. Valid values are: casefold, nfc, nfd, nfkc, nfkd. :param maxdepth: (Optional.) How many individual characters to use when denormalizing the base. This is used to limit the total number of denormalizations of the overall base. :return: The number of denormalizations as an :class:`int`. :rtype: int :usage: To count the number of possible denormalizations for a given string and form: >>> base = '<->' >>> form = 'nfkc' >>> count_denormalizations(base, form) 8 """ chars = (Character(c) for c in base) counts = [] for char in chars: count = len(char.denormalize(form)) if count == 0: count = 1 if maxdepth and count > maxdepth: count = maxdepth counts.append(count) return int(prod(counts))
[docs] def denormalize( base: str, form: str, maxdepth: int = 0, maxresults: int | None = None, random: bool = False, seed_: bytes | int | str = '' ) -> tuple[str, ...]: """Denormalize a string. :param base: The :class:`str` to denormalize. :param form: The Unicode normalization form to denormalize from. Valid values are: casefold, nfc, nfd, nfkc, nfkd. :param maxdepth: (Optional.) How many denormalizations per character in the base string to use when denormalizing the base. This is used to limit the total number of denormalizations of the overall base. If `maxdepth` is zero, the number of denormalizations to use per character is not limited. :param maxresults: (Optional.) The maximum number of results to return. Default behavior varies based on the `random` parameter. If `random` is `False`, default is to return all possible denormalizattions. Otherwise, the default is to return one. :param random: (Optional.) Whether to pick randomly from the possible denormalization results. Defaults to false. :param seed: (Optional.) A seed value for the random number generator. Defaults to not seeding the generator. :return: The denormalizations as a :class:`tuple`. :rtype: tuple :usage: To denormalize a given string with the given form:: >>> base = '<>' >>> form = 'nfkc' >>> denormalize(base, form) ('﹤﹥', '﹤>', '<﹥', '<>') The `maxdepth` parameter can be used to limit the number of denormalizations per character in the `base` string. This is useful when you want just a few denormalizations of a string with a very large number of denormalizations:: >>> base = 'hi' >>> form = 'nfkc' >>> maxdepth = 2 >>> denormalize(base, form, maxdepth) ('ʰᵢ', 'ʰⁱ', 'ₕᵢ', 'ₕⁱ') """ if random: return random_denormalize(base, form, maxresults, seed_) # Get the denormalized forms of the first character. char = Character(base[0]) dechars = list(char.denormalize(form)) # If there are no denormalized forms, then it is the denormalized form. 
if not dechars: dechars = [char.value,] # Limit the number of permutations by limiting the number of # denormalized forms we are looking at. if maxdepth and len(dechars) > maxdepth: dechars = dechars[:maxdepth] # If there are more characters left, use recursion to get the # permutations for those characters, then get the permutations # with this character. if base[1:]: results = [] tails = denormalize(base[1:], form, maxdepth, maxresults) for dechar in dechars: for tail in tails: results.append(dechar + tail) # If there are no characters left, then the permutations are just # the denormalized forms of the character. else: results = dechars # Truncate to the maximum results and return. if maxresults: results = results[:maxresults] return tuple(results)
def gen_denormalize(
    base: str,
    form: str,
    maxdepth: int = 0
) -> Generator[str, None, None]:
    """Denormalize a string, yielding the results as they are
    generated.

    :param base: The :class:`str` to denormalize. An empty string
        has exactly one denormalization: the empty string.
    :param form: The Unicode normalization form to denormalize from.
        Valid values are: casefold, nfc, nfd, nfkc, nfkd.
    :param maxdepth: (Optional.) How many denormalizations per
        character in the base string to use when denormalizing the
        base. This is used to limit the total number of
        denormalizations of the overall base. If `maxdepth` is zero,
        the number of denormalizations to use per character is not
        limited.
    :return: A :class:`collections.abc.Generator` that yields the
        denormalization results.
    :rtype: collections.abc.Generator

    :usage:

    To generate denormalizations for a given string with a given
    form:

        >>> base = '<>'
        >>> form = 'nfkc'
        >>> dngen = gen_denormalize(base, form)
        >>> [result for result in dngen]
        ['﹤﹥', '﹤>', '<﹥', '<>']

    The `maxdepth` parameter can be used to limit the number of
    denormalizations per character in the `base` string. This is
    useful when you want just a few denormalizations of a string
    with a very large number of denormalizations:

        >>> base = 'hi'
        >>> form = 'nfkc'
        >>> maxdepth = 2
        >>> dngen = gen_denormalize(base, form, maxdepth)
        >>> [result for result in dngen]
        ['ʰᵢ', 'ʰⁱ', 'ₕᵢ', 'ₕⁱ']

    """
    # Denormalize each character exactly once, instead of redoing the
    # whole tail for every alternative of the characters before it.
    options = []
    for c in base:
        char = Character(c)
        dechars = char.denormalize(form)

        # A character with no denormalized forms stands for itself.
        if not dechars:
            dechars = (char.value,)

        if maxdepth:
            dechars = dechars[:maxdepth]

        options.append(dechars)

    # product() varies the rightmost character fastest, which is the
    # same order nested head/tail loops would produce. It is also
    # lazy and not recursive, so long strings do not hit the
    # recursion limit. With no characters it yields one empty
    # combination, so an empty `base` yields ''.
    for combo in product(*options):
        yield ''.join(combo)


[docs] def gen_random_denormalize( base: str, form: str, maxresults: int = 1, seed_: bytes | int | str = '' ) -> Generator[str, None, None]: """Randomly denormalize a string, yielding the results as they are generated. This is useful when returning all results for a denormalization is unreasonably large, as can easily happen when denormalizing strings containing Latin letters. :param base: The :class:`str` to denormalize. :param form: The Unicode normalization for to denormalize from. Valid values are: NFC, NFD, NFKC, NFKD. :param maxresults: (Optional.) The maximum number of results to return. The default is to return one. :param seed: (Optional.) A seed value for the random number generator. Defaults to not seeding the generator. :return: A :class:`collections.abc.Generator` that yields the random denormalization results. :rtype: collections.abc.Generator :usage: To generate a random denormalization of a given string with a given form: .. testsetup:: gen_random_denormalize_1 from charex.denormal import gen_random_denormalize, seed seed('spam') .. doctest:: gen_random_denormalize_1 >>> base = '<script>' >>> form = 'nfkc' >>> dngrd = gen_random_denormalize(base, form) >>> [result for result in dngrd] ['﹤𝓈ᶜ𝕣𝚒𝙥𝙩>'] The `maxresults` parameter tells the generator to return the given number of results: .. testsetup:: gen_random_denormalize_2 from charex.denormal import gen_random_denormalize, seed seed('spam') .. doctest:: gen_random_denormalize_2 >>> base = '<script>' >>> form = 'nfkc' >>> maxresults = 3 >>> dngrd = gen_random_denormalize(base, form, maxresults) >>> [result for result in dngrd] ['﹤𝓈ᶜ𝕣𝚒𝙥𝙩>', '<𝖘ᶜ𝓇𝕚ᵖ𝓉>', '﹤𝙨𝚌𝑟𝗂𝐩t>'] """ chars = [denormalize(char, form) for char in base] if seed_: seed(seed_) for _ in range(maxresults): result = ''.join(choice(char) for char in chars) yield result
def random_denormalize( base: str, form: str, maxresults: int | None = None, seed_: bytes | int | str = '' ) -> tuple[str, ...]: """Randomly denormalize a string. This is useful when returning all results for a denormalization is unreasonably large, as can easily happen when denormalizing strings containing Latin letters. :param base: The :class:`str` to denormalize. :param form: The Unicode normalization for to denormalize from. Valid values are: NFC, NFD, NFKC, NFKD. :param maxresults: (Optional.) The maximum number of results to return. Default behavior varies based on the `random` parameter. If `random` is `False`, default is to return all possible denormalizattions. Otherwise, the default is to return one. :param seed: (Optional.) A seed value for the random number generator. Defaults to not seeding the generator. :return: The denormalizations as a :class:`tuple`. :rtype: tuple :usage: To get a random denormalization of the given string using the given form: >>> # The seed parameter seeds the RNG to produce repeatable >>> # results for testing. Don't use it unless you want >>> # repeatable results. >>> seed_ = 'spam' >>> >>> base = '<script>' >>> form = 'nfkc' >>> random_denormalize(base, form, seed_=seed_) ('﹤𝓈ᶜ𝕣𝚒𝙥𝙩>',) The `maxresults` parameter tells the function to return the given number of results: >>> # The seed parameter seeds the RNG to produce repeatable >>> # results for testing. Don't use it unless you want >>> # repeatable results. >>> seed_ = 'spam' >>> >>> base = '<script>' >>> form = 'nfkc' >>> maxresults = 3 >>> random_denormalize(base, form, maxresults, seed_=seed_) ('﹤𝓈ᶜ𝕣𝚒𝙥𝙩>', '<𝖘ᶜ𝓇𝕚ᵖ𝓉>', '﹤𝙨𝚌𝑟𝗂𝐩t>') """ # Ensure at least one result is returned. if not maxresults: maxresults = 1 # Seeding the RNG allows for repeatability in testing. if seed_: seed(seed_) # Get the denormalized forms for all the characters in the string. 
chars = [ denormalize(char, form) for char in base ] # Randomly pick from the possible denormalizations for each character # when creating the denormalized strings, then return the results. results = [] for _ in range(maxresults): result = ''.join(choice(char) for char in chars) results.append(result) return tuple(results)