Source code for charex.charex

"""
charex
~~~~~~

Tools for exploring Unicode characters and other character sets.
"""
import re
import unicodedata as ucd
from collections.abc import Generator, Sequence
from typing import Literal, cast, overload

from charex import db, util
from charex.escape import schemes


# Global values.
# Names of the normalization forms this module accepts, in upper case.
# validate_normalization_form() upper-cases its input and checks it for
# membership in this list.
normalization_forms = ['NFC', 'NFD', 'NFKC', 'NFKD']


# Common types.
# Static-typing counterpart of normalization_forms: the literal strings
# a validated normalization form may take. Keep the two in sync.
NormForms = Literal['NFC', 'NFD', 'NFKC', 'NFKD']


# Exceptions.
class InvalidNormalizationFormError(ValueError):
    """Raised when a string does not name a supported normalization
    form.
    """


# Classes.
class Character:
    """A Unicode character.

    :param value: A character address string for the Unicode
        character. See below.
    :return: The character as a :class:`charex.Character`.
    :rtype: charex.Character

    :usage:

    To create a :class:`charex.Character` object for a single
    character string:

        >>> value = 'a'
        >>> char = Character(value)
        >>> char.value
        'a'

    To create a :class:`charex.Character` object for a Unicode
    code point:

        >>> value = 'U+0061'
        >>> char = Character(value)
        >>> char.value
        'a'

    To create a :class:`charex.Character` object for a binary
    string:

        >>> value = '0b01100001'
        >>> char = Character(value)
        >>> char.value
        'a'

    To create a :class:`charex.Character` object for an octal
    string:

        >>> value = '0o141'
        >>> char = Character(value)
        >>> char.value
        'a'

    To create a :class:`charex.Character` object for a decimal
    string:

        >>> value = '0d97'
        >>> char = Character(value)
        >>> char.value
        'a'

    To create a :class:`charex.Character` object for a hex
    string:

        >>> value = '0x61'
        >>> char = Character(value)
        >>> char.value
        'a'

    Beyond the declared properties and methods described below,
    most Unicode properties for the character are available by
    calling their alias as a property of :class:`charex.Character`:

        >>> value = 'a'
        >>> char = Character(value)
        >>> char.na
        'LATIN SMALL LETTER A'
        >>> char.blk
        'Basic Latin'
        >>> char.sc
        'Latn'
        >>> char.suc
        '0041'

    :address formats:

    The understood str-based formats for manual input of addresses
    are:

    *   Character: A string with length equal to one.
    *   Code Point: The prefix "U+" followed by a hexadecimal number.
    *   Binary String: The prefix "0b" followed by a binary number.
    *   Octal String: The prefix "0o" followed by an octal number.
    *   Decimal String: The prefix "0d" followed by a decimal number.
    *   Hex String: The prefix "0x" followed by a hexadecimal number.

    The following formats are available for use through the API:

    *   Bytes: A :class:`bytes`.
    *   Integer: An :class:`int`.
    """
    # Shared handle on the Unicode data cache; the module-level
    # helper functions read it as ``Character.cache``.
    cache = db.cache

    def __init__(self, value: bytes | int | str) -> None:
        value = util.to_char(value)
        self.__value = value
        self._rev_normal_cache: dict[str, tuple[str, ...]] = {}

    def __getattr__(self, name: str):
        """Look up a Unicode property by its alias.

        Python only calls this after normal attribute lookup has
        failed, so declared attributes never reach it.

        :param name: The alias of the Unicode property.
        :return: The value of the property for the character.
        :raises AttributeError: If `name` starts with an underscore.
        """
        # Private and dunder names are never property aliases. Without
        # this guard, protocol probes such as ``__deepcopy__`` or
        # ``__setstate__`` are sent to the database, and a probe made
        # before ``__init__`` has run (as copy and pickle do) recurses
        # forever: code_point -> value -> the unset ``__value`` -> back
        # into ``__getattr__``.
        if name.startswith('_'):
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {name!r}'
            )
        name = name.casefold()
        code = self.code_point[2:].casefold()
        # NOTE: errors raised by the lookup for an unknown alias are
        # passed through unchanged; filter_by_property() relies on
        # catching KeyError from attribute access.
        return db.get_value_for_code(name, code)

    def __repr__(self) -> str:
        name = self.na
        # Control characters share the placeholder name '<control>',
        # so show the ``na1`` property instead to tell them apart.
        if name == '<control>':
            name = f'<{self.na1}>'
        return f'{self.code_point} ({name})'

    # Derived properties.
    @property
    def code_point(self) -> str:
        """The address for the character in the Unicode database."""
        return util.to_code(self.value, 'U+').upper()

    @property
    def value(self) -> str:
        """The Unicode character as a string."""
        return self.__value

    # Public methods.
    def denormalize(self, form: str) -> tuple[str, ...]:
        """Return the characters that normalize to the character using
        the given form.

        :param form: The normalization form to check against.
        :return: The denormalization results in a :class:`tuple`.
        :rtype: tuple

        :usage:

        To denormalize the character for the given form:

            >>> # Create the character object.
            >>> value = '<'
            >>> char = Character(value)
            >>>
            >>> # Get the denormalizations for the character.
            >>> form = 'nfkc'
            >>> char.denormalize(form)
            ('﹤', '<')

        """
        prop = f'rev_{form}'
        code = self.code_point[2:].casefold()
        return db.get_denormal_map_for_code(prop, code)

    def escape(self, scheme: str, codec: str = 'utf8') -> str:
        """The escaped version of the character.

        :param scheme: The escape scheme to use.
        :param codec: The codec to use when escaping to a hexadecimal
            string.
        :return: A :class:`str` with the escaped character. An empty
            string is returned if the character cannot be encoded.
        :rtype: str

        :usage:

        To escape the character with the given form:

            >>> value = '<'
            >>> char = Character(value)
            >>>
            >>> scheme = 'html'
            >>> char.escape(scheme)
            '&nvlt;'

        """
        try:
            scheme = scheme.casefold()
            fn = schemes[scheme]
            return fn(self.value, codec)

        # UTF-16 surrogates will error when anything tries to
        # encode them as UTF-8.
        except UnicodeEncodeError:
            return ''

    def encode(self, codec: str) -> str:
        """The hexadecimal value for the character in the given
        character set.

        :param codec: The codec to use when encoding to a hexadecimal
            string.
        :return: A :class:`str` with the encoded character as
            space-separated, upper case, two digit hexadecimal bytes.
            An empty string is returned if the character cannot be
            encoded with the codec.
        :rtype: str

        :usage:

        To encode the character with the given character set:

            >>> value = 'å'
            >>> char = Character(value)
            >>>
            >>> codec = 'utf8'
            >>> char.encode(codec)
            'C3 A5'

        """
        try:
            encoded = self.value.encode(codec)

        # UTF-16 surrogates will error when anything tries to
        # encode them as UTF-8.
        except UnicodeEncodeError:
            return ''

        return ' '.join(f'{byte:02X}' for byte in encoded)

    def is_normal(self, form: str) -> bool:
        """Is the character normalized to the given form?

        :param form: The normalization form to check against.
        :return: A :class:`bool` indicating whether the character is
            normalized.
        :rtype: bool
        :raises InvalidNormalizationFormError: If `form` is not a
            valid normalization form.

        :usage:

        To determine whether the character is already normalized for
        the given scheme.

            >>> value = 'å'
            >>> char = Character(value)
            >>>
            >>> form = 'nfc'
            >>> char.is_normal(form)
            True

        """
        valid = validate_normalization_form(form)
        return ucd.is_normalized(valid, self.value)

    def normalize(self, form: str) -> str:
        """Normalize the character using the given form.

        :param form: The normalization form to check against.
        :return: The normalization result as a :class:`str`.
        :rtype: str
        :raises InvalidNormalizationFormError: If `form` is not a
            valid normalization form.

        :usage:

        To normalize the character for the given form::

            >>> value = '<'
            >>> char = Character(value)
            >>>
            >>> form = 'nfkc'
            >>> char.normalize(form)
            '<'

        """
        valid = validate_normalization_form(form)
        return ucd.normalize(valid, self.value)

    def summarize(self) -> str:
        """Return a summary of the character's information.

        :return: The character information as a :class:`str`.
        :rtype: str

        :usage:

        To summarize the character::

            >>> value = 'å'
            >>> char = Character(value)
            >>>
            >>> char.summarize()
            'å U+00E5 (LATIN SMALL LETTER A WITH RING ABOVE)'

        """
        value = util.neutralize_control_characters(self.value)
        return f'{value} {self!r}'
# Utility functions.
def alias_property(longname: str, space: bool = True) -> str:
    """Look up the alias registered for the long name of a Unicode
    property.

    :param longname: The long name for the property.
    :param space: (Optional.) Whether to replace spaces in the long
        name with underscores before the lookup. Defaults to `True`.
    :return: The alias as a :class:`str`.
    :rtype: str

    :usage:

    To get the alias of a Unicode property:

        >>> longname = 'Case Folding'
        >>> alias_property(longname)
        'cf'

    """
    key = longname.replace(' ', '_') if space else longname
    entry = Character.cache.property_alias[key.casefold()]
    return entry.alias
def expand_property(prop: str) -> str:
    """Look up the long name registered for the short name of a
    Unicode property.

    :param prop: The short name of the property.
    :return: The long name as a :class:`str`, with underscores
        replaced by spaces.
    :rtype: str

    :usage:

    To get the long name of a Unicode property:

        >>> prop = 'cf'
        >>> expand_property(prop)
        'Case Folding'

    """
    entry = Character.cache.property_name[prop.casefold()]
    return entry.name.replace('_', ' ')
def expand_property_value(prop: str, alias: str) -> str:
    """Look up the long name registered for the short name of a
    Unicode property value.

    :param prop: The type of property.
    :param alias: The short name to translate.
    :return: The long name of the property value as a :class:`str`,
        with underscores replaced by spaces.
    :rtype: str

    :usage:

    To get the long name for a property value:

        >>> alias = 'Cc'
        >>> prop = 'gc'
        >>> expand_property_value(prop, alias)
        'Control'

    """
    # Both lookup keys are case-folded before either lookup is made.
    prop_key, alias_key = prop.casefold(), alias.casefold()
    entry = Character.cache.value_name[prop_key][alias_key]
    return entry.name.replace('_', ' ')
def filter_by_property(
    prop: str,
    value: str,
    chars: Sequence[Character] | None = None,
    insensitive: bool = False,
    regex: bool = False
) -> Generator[Character, None, None]:
    """Return all the characters with the given property value.

    :param prop: The property to filter on.
    :param value: The pattern to filter on.
    :param chars: (Optional.) The characters to filter. Defaults
        to filtering all Unicode characters. An empty sequence is
        treated the same as not passing the parameter.
    :param insensitive: (Optional.) Whether the matching should
        be case insensitive. Defaults to false.
    :param regex: (Optional.) Whether the value should be used as
        a regular expression for the matching. The pattern is matched
        from the start of the property value. Defaults to false.
    :return: the filtered characters as a
        :class:`collections.abc.Generator`.
    :rtype: collections.abc.Generator

    :usage:

    To get a generator that produces the Emoji modifiers:

        >>> prop = 'emod'
        >>> value = 'Y'
        >>> gen = filter_by_property(prop, value)
        >>> for char in gen:
        ...     print(char.summarize())
        ...
        🏻 U+1F3FB (EMOJI MODIFIER FITZPATRICK TYPE-1-2)
        🏼 U+1F3FC (EMOJI MODIFIER FITZPATRICK TYPE-3)
        🏽 U+1F3FD (EMOJI MODIFIER FITZPATRICK TYPE-4)
        🏾 U+1F3FE (EMOJI MODIFIER FITZPATRICK TYPE-5)
        🏿 U+1F3FF (EMOJI MODIFIER FITZPATRICK TYPE-6)

    You can limit the number of characters being searched with the
    `chars` parameter:

        >>> prop = 'gc'
        >>> value = 'Cc'
        >>> chars = [Character(chr(n)) for n in range(128)]
        >>> gen = filter_by_property(prop, value, chars)
        >>> for char in gen:
        ...     print(char.summarize())
        ...
        ␀ U+0000 (<NULL>)
        ␁ U+0001 (<START OF HEADING>)
        ␂ U+0002 (<START OF TEXT>)
        ␃ U+0003 (<END OF TEXT>)
        ␄ U+0004 (<END OF TRANSMISSION>)
        ␅ U+0005 (<ENQUIRY>)
        ␆ U+0006 (<ACKNOWLEDGE>)
        ␇ U+0007 (<BELL>)
        ␈ U+0008 (<BACKSPACE>)
        ␉ U+0009 (<CHARACTER TABULATION>)
        ␊ U+000A (<LINE FEED (LF)>)
        ␋ U+000B (<LINE TABULATION>)
        ␌ U+000C (<FORM FEED (FF)>)
        ␍ U+000D (<CARRIAGE RETURN (CR)>)
        ␎ U+000E (<SHIFT OUT>)
        ␏ U+000F (<SHIFT IN>)
        ␐ U+0010 (<DATA LINK ESCAPE>)
        ␑ U+0011 (<DEVICE CONTROL ONE>)
        ␒ U+0012 (<DEVICE CONTROL TWO>)
        ␓ U+0013 (<DEVICE CONTROL THREE>)
        ␔ U+0014 (<DEVICE CONTROL FOUR>)
        ␕ U+0015 (<NEGATIVE ACKNOWLEDGE>)
        ␖ U+0016 (<SYNCHRONOUS IDLE>)
        ␗ U+0017 (<END OF TRANSMISSION BLOCK>)
        ␘ U+0018 (<CANCEL>)
        ␙ U+0019 (<END OF MEDIUM>)
        ␚ U+001A (<SUBSTITUTE>)
        ␛ U+001B (<ESCAPE>)
        ␜ U+001C (<INFORMATION SEPARATOR FOUR>)
        ␝ U+001D (<INFORMATION SEPARATOR THREE>)
        ␞ U+001E (<INFORMATION SEPARATOR TWO>)
        ␟ U+001F (<INFORMATION SEPARATOR ONE>)
        ⑿ U+007F (<DELETE>)

    You can set the `insensitive` parameter to do case insensitive
    matching:

        >>> prop = 'emod'
        >>> value = 'y'
        >>> insensitive = True
        >>> gen = filter_by_property(prop, value, insensitive=insensitive)
        >>> for char in gen:
        ...     print(char.summarize())
        ...
        🏻 U+1F3FB (EMOJI MODIFIER FITZPATRICK TYPE-1-2)
        🏼 U+1F3FC (EMOJI MODIFIER FITZPATRICK TYPE-3)
        🏽 U+1F3FD (EMOJI MODIFIER FITZPATRICK TYPE-4)
        🏾 U+1F3FE (EMOJI MODIFIER FITZPATRICK TYPE-5)
        🏿 U+1F3FF (EMOJI MODIFIER FITZPATRICK TYPE-6)

    If you set the `regex` parameter, you can search using regular
    expressions:

        >>> prop = 'na'
        >>> value = '.*EYE$'
        >>> regex = True
        >>> gen = filter_by_property(prop, value, regex=regex)
        >>> for char in gen:
        ...     print(char.summarize())
        ...
        ◉ U+25C9 (FISHEYE)
        ◎ U+25CE (BULLSEYE)
        ⺫ U+2EAB (CJK RADICAL EYE)
        ⽬ U+2F6C (KANGXI RADICAL EYE)
        👁 U+1F441 (EYE)
        😜 U+1F61C (FACE WITH STUCK-OUT TONGUE AND WINKING EYE)
        🤪 U+1F92A (GRINNING FACE WITH ONE LARGE AND ONE SMALL EYE)
        🫣 U+1FAE3 (FACE WITH PEEKING EYE)

    .. warning::
        If you don't limit the characters you are doing the filter
        on, this will be a single-threaded comparison against every
        Unicode code point, which is more than a million characters.
        In other words, it's not the speediest thing in the world.
    """
    # Default to searching the full set of Unicode code points. The
    # characters are created lazily, one at a time, so the first match
    # can be yielded without first building more than a million
    # Character objects in memory. Note that an empty `chars` is falsy
    # and therefore also selects the full search.
    candidates = chars or (
        Character(n) for n in range(util.LEN_UNICODE)
    )

    # Choose how a property value is compared to `value` once, up
    # front, rather than repeating the loop for each matching mode.
    if regex:
        flags = re.IGNORECASE if insensitive else 0
        matches = re.compile(value, flags=flags).match
    elif insensitive:
        folded = value.casefold()

        def matches(text):
            return text.casefold() == folded
    else:
        def matches(text):
            return text == value

    for char in candidates:
        # Characters whose property lookup raises KeyError are skipped.
        try:
            text = getattr(char, prop)
        except KeyError:
            continue
        if matches(text):
            yield char
def get_category_members(category: str) -> tuple[Character, ...]:
    """Get all characters that are members of the given category.

    :param category: The general category to collect, in the form
        returned by :func:`unicodedata.category`, such as `'Lu'`.
        The comparison is case sensitive.
    :return: The members of the category as a :class:`tuple` of
        :class:`charex.Character` objects, in code point order.
    :rtype: tuple
    """
    # Code points run from U+0000 through U+10FFFF inclusive, so the
    # exclusive upper bound of the range is one past the last of them.
    code_point_count = 0x10FFFF + 1
    return tuple(
        Character(n)
        for n in range(code_point_count)
        if ucd.category(chr(n)) == category
    )
def get_properties() -> tuple[str, ...]:
    """Get the valid Unicode properties.

    :return: The case-folded aliases of the properties, sorted, as
        a :class:`tuple`.
    :rtype: tuple

    :usage:

    To get the list of Unicode properties:

        >>> get_properties()                    # doctest: +ELLIPSIS
        ('age', 'ahex',... 'xo_nfkd')

    """
    registry = Character.cache.property_alias

    # The same entry can be registered under more than one key, so
    # keep only the first occurrence of each.
    distinct = []
    for key in registry:
        entry = registry[key]
        if entry in distinct:
            continue
        distinct.append(entry)

    return tuple(sorted(entry.alias.casefold() for entry in distinct))
def get_property_values(prop: str) -> tuple[str, ...]:
    """Get the valid property value aliases for a property.

    :param prop: The short name of the property.
    :return: The valid values for the property as a :class:`tuple`.
    :rtype: tuple

    :usage:

    To get the valid property values::

        >>> prop = 'gc'
        >>> get_property_values(prop)           # doctest: +ELLIPSIS
        ('C', 'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'L',... 'Zs')

    """
    registry = Character.cache.value_aliases[prop]

    # The same entry can be registered under more than one key, so
    # keep only the first occurrence of each, in iteration order.
    distinct = []
    for key in registry:
        entry = registry[key]
        if entry in distinct:
            continue
        distinct.append(entry)

    return tuple(entry.alias for entry in distinct)
def make_flag(code: str) -> str:
    """Get the flag emoji for a region code.

    :param code: The region code for the flag. These can be two
        character strings as defined by ISO 3166-1 alpha 2, which
        is also used to define the country code top level domains.
        It will also try to make longer ISO 3166-2 country region
        codes work.
    :return: The country code as regional indicator symbol
        characters as a :class:`str`, which should render as the
        flag for the given country code.
    :rtype: str

    :usage:

    To get the flag for the United Kingdom:

        >>> make_flag('gb')
        '🇬🇧'

    This doesn't do any checking against ISO 3166-1 alpha 2, so
    codes that don't exist will still return, they just won't
    render as a flag:

        >>> make_flag('uk')
        '🇺🇰'

    :mod:`charex` doesn't control how glyphs are rendered by the
    terminal, application, or operating system rendering them. This
    may lead to some flags not rendering if they aren't supported
    by the system you are running on.

        >>> make_flag('tw')
        '🇹🇼'

    :func:`charex.make_flag` will interpret strings longer than two
    characters as ISO 3166-2 country region codes, and try to turn
    them into flags. Whether the actual flag renders will depend on
    whether your terminal, application, and operating system support
    it.

        >>> make_flag('GB-WLS')
        '🏴\U000e0067\U000e0062\U000e0077\U000e006c\U000e0073\U000e007f'
        >>> make_flag('us-il')
        '🏴\U000e0075\U000e0073\U000e0069\U000e006c\U000e007f'

    """
    folded = code.casefold()

    # Anything that isn't two characters long is built as a tag
    # sequence: a black flag, one tag character for each character of
    # the code other than hyphens, then the cancel tag.
    if len(folded) != 2:
        tag_base = 0xe0000
        tags = ''.join(chr(tag_base + ord(ch)) for ch in folded if ch != '-')
        return '\U0001F3F4' + tags + '\U000E007F'

    # Flags for country codes: this base maps 'a' onto the first
    # regional indicator symbol.
    indicator_base = 0x1f185
    return ''.join(chr(indicator_base + ord(ch)) for ch in folded)


def validate_normalization_form(form: str) -> NormForms:
    """Validate whether the given data is a normalization form.

    :param form: A :class:`str` that should be a normalization form.
        It is matched without regard to case.
    :return: The validated normalization form, in upper case.
    :rtype: str
    :raises InvalidNormalizationFormError: If `form` is not one of
        the known normalization forms.

    :usage:

    To validate a normalization form::

        >>> form = 'NFD'
        >>> form == validate_normalization_form(form)
        True

    """
    candidate = form.upper()
    if candidate not in normalization_forms:
        msg = f'{form} is not a valid normalization form.'
        raise InvalidNormalizationFormError(msg)
    return cast(NormForms, candidate)