Source code for charex.charsets

"""
charsets
~~~~~~~~

Data and functions for working with character sets.
"""
from collections.abc import Iterable, Iterator
from dataclasses import dataclass
from sys import byteorder

from charex import util


# Data classes.
@dataclass
class CodecDetails:
    """Per-codec settings used when encoding, decoding and describing
    a character set.

    Every field has a default, so an entry only needs to spell out
    the values that differ from a single-byte, native-order codec.

    :param size: (Optional.) The number of bytes used to address
        characters with the given character set. Defaults to one.
    :param endian: (Optional.) The byte order used by the codec,
        either `'big'` or `'little'`. Defaults to the byte order
        reported by :data:`sys.byteorder`.
    :param description: (Optional.) A short, human-readable summary
        of the codec. Defaults to an empty string.
    """
    size: int = 1
    endian: str = byteorder
    description: str = ''


# Encoding schemes.
# Registry of the supported codecs. Each key is the codec name handed
# to `str.encode` / `bytes.decode`; each value carries the address
# size, byte order and description for that codec.
codecs = {
    'ascii': CodecDetails(
        description=(
            'RFC20 The ASCII format for Network Interchange.'
        )
    ),
    'big5': CodecDetails(
        description=(
            'The Big5 encoding method for traditional Chinese characters '
            'developed by the Institute for Information Industry of Taiwan '
            'in 1984.'
        )
    ),
    'big5hkscs': CodecDetails(
        description=(
            'Hong Kong Supplementary Character Set to the Big5 traditional '
            'Chinese character set.'
        )
    ),
    'cp037': CodecDetails(
        description=(
            'EBCDIC code page 37, USA/Canada Country Extended Code Page.'
        )
    ),
    'cp273': CodecDetails(
        description=(
            'EBCDIC code page 273, Germany/Austria.'
        )
    ),
    'cp424': CodecDetails(
        description=(
            'EBCDIC code page 424, Israel with support for Hebrew.'
        )
    ),
    'cp437': CodecDetails(
        description=(
            'Code page 437, default character set for the IBM PC.'
        )
    ),
    'cp500': CodecDetails(
        description=(
            'EBCDIC code page 500, full support of the Latin-1 character set.'
        )
    ),
    'cp720': CodecDetails(
        description=(
            'Code page 720, Arabic support for DOS.'
        )
    ),
    'cp737': CodecDetails(
        description=(
            'Code page 737, Greek support for DOS.'
        )
    ),
    'cp775': CodecDetails(
        description=(
            'Code page 775, Baltic language support for DOS.'
        )
    ),
    'cp850': CodecDetails(
        description=(
            'Code page 850, Western European language support for DOS.'
        )
    ),
    'cp852': CodecDetails(
        description=(
            'Code page 852, Central European language support for DOS.'
        )
    ),
    'cp855': CodecDetails(
        description=(
            'Code page 855, Cyrillic support for DOS.'
        )
    ),
    'cp856': CodecDetails(
        description=(
            'Code page 856, Hebrew language support for DOS.'
        )
    ),
    'cp857': CodecDetails(
        description=(
            'Code page 857, Turkish language support for DOS.'
        )
    ),
    'cp858': CodecDetails(
        description=(
            'Code page 858, Western European language support for DOS, '
            'modifying code page 850 by adding the Euro symbol.'
        )
    ),
    'cp860': CodecDetails(
        description=(
            'Code page 860, Portuguese language support for DOS.'
        )
    ),
    'cp861': CodecDetails(
        description=(
            'Code page 861, Icelandic language support for DOS.'
        )
    ),
    'cp862': CodecDetails(
        description=(
            'Code page 862, Hebrew language support for DOS.'
        )
    ),
    'cp863': CodecDetails(
        description=(
            'Code page 863, Canadian French language support for DOS.'
        )
    ),
    'cp864': CodecDetails(
        description=(
            'Code page 864, Arabic language support for DOS.'
        )
    ),
    'cp865': CodecDetails(
        description=(
            'Code page 865, Nordic language support for DOS.'
        )
    ),
    'cp866': CodecDetails(
        description=(
            'Code page 866, Cyrillic support for DOS.'
        )
    ),
    'cp869': CodecDetails(
        description=(
            'Code page 869, Greek language support for DOS.'
        )
    ),
    'cp874': CodecDetails(
        description=(
            'Code page 874, Thai language support for DOS.'
        )
    ),
    'cp875': CodecDetails(
        description=(
            'EBCDIC code page 875, Greek.'
        )
    ),
    'cp932': CodecDetails(
        description=(
            'Code page 932, Japanese language support for Windows.'
        )
    ),
    'cp949': CodecDetails(
        description=(
            'Code page 949, Korean language support by IBM.'
        )
    ),
    'cp950': CodecDetails(
        description=(
            'Code page 950, Traditional Chinese language support for Windows.'
        )
    ),
    'cp1006': CodecDetails(
        description=(
            'Code page 1006, Urdu language support for AIX.'
        )
    ),
    'cp1026': CodecDetails(
        description=(
            'EBCDIC code page 1026, Turkish.'
        )
    ),
    'cp1125': CodecDetails(
        description=(
            'IBM code page 1125, Ukraine.'
        )
    ),
    'cp1140': CodecDetails(
        description=(
            'EBCDIC code page 1140, USA/Canada with Euro character.'
        )
    ),
    'cp1250': CodecDetails(
        description=(
            'Code page 1250, Central European language support for Windows.'
        )
    ),
    'cp1251': CodecDetails(
        description=(
            'Code page 1251, Cyrillic support for Windows.'
        )
    ),
    'cp1252': CodecDetails(
        description=(
            'Code page 1252, Latin-1 character set for Windows.'
        )
    ),
    'cp1253': CodecDetails(
        description=(
            'Code page 1253, Greek support for Windows.'
        )
    ),
    'cp1254': CodecDetails(
        description=(
            'Code page 1254, Turkish support for Windows.'
        )
    ),
    'cp1255': CodecDetails(
        description=(
            'Code page 1255, Hebrew support for Windows.'
        )
    ),
    'cp1256': CodecDetails(
        description=(
            'Code page 1256, Arabic support for Windows.'
        )
    ),
    'cp1257': CodecDetails(
        description=(
            'Code page 1257, Baltic language support for Windows.'
        )
    ),
    'cp1258': CodecDetails(
        description=(
            'Code page 1258, Vietnamese support for Windows.'
        )
    ),
    'euc_jp': CodecDetails(
        description=(
            'Extended Unix Code Japanese.'
        )
    ),
    'euc_jis_2004': CodecDetails(
        description=(
            'Extended Unix Code Japanese Industrial Standard 2004.'
        )
    ),
    'euc_jisx0213': CodecDetails(
        description=(
            'Extended Unix Code Japanese Industrial Standard X 0213.'
        )
    ),
    'euc_kr': CodecDetails(
        description=(
            'Extended Unix Code Korean.'
        )
    ),
    'gb2312': CodecDetails(
        description=(
            'Extended Unix Code Simplified Chinese.'
        )
    ),
    'gbk': CodecDetails(
        description=(
            'Extended Unix Code Simplified Chinese extended to include '
            'all unified CJK characters.'
        )
    ),
    'gb18030': CodecDetails(
        description=(
            'Chinese National Standard GB 18030-2005: Information '
            'Technology—Chinese coded character set.'
        )
    ),
    'hz': CodecDetails(
        description=(
            'Extended Unix Code Simplified Chinese for email.'
        )
    ),
    'iso2022_jp': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese.'
        )
    ),
    'iso2022_jp_1': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension 1.'
        )
    ),
    'iso2022_jp_2': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension 2.'
        )
    ),
    'iso2022_jp_2004': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension 2004.'
        )
    ),
    'iso2022_jp_3': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension 3.'
        )
    ),
    'iso2022_jp_ext': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension.'
        )
    ),
    'iso2022_kr': CodecDetails(
        description=(
            'RFC1557 Korean Character Encoding for Internet Messages.'
        )
    ),
    'latin_1': CodecDetails(
        description=(
            'ISO-8859-1, Latin alphabet number 1 for western Europe.'
        )
    ),
    'iso8859_2': CodecDetails(
        description=(
            'ISO-8859-2, Latin alphabet number 2 for central Europe.'
        )
    ),
    'iso8859_3': CodecDetails(
        description=(
            'ISO-8859-3, Latin alphabet number 3 for southern Europe.'
        )
    ),
    'iso8859_4': CodecDetails(
        description=(
            'ISO-8859-4, Latin alphabet number 4 for northern Europe.'
        )
    ),
    'iso8859_5': CodecDetails(
        description=(
            'ISO-8859-5, Latin/Cyrillic alphabet.'
        )
    ),
    'iso8859_6': CodecDetails(
        description=(
            'ISO-8859-6, Latin/Arabic alphabet.'
        )
    ),
    'iso8859_7': CodecDetails(
        description=(
            'ISO-8859-7, Latin/Greek alphabet.'
        )
    ),
    'iso8859_8': CodecDetails(
        description=(
            'ISO-8859-8, Latin/Hebrew alphabet.'
        )
    ),
    'iso8859_9': CodecDetails(
        description=(
            'ISO-8859-9, Latin alphabet number 5 for Turkish.'
        )
    ),
    'iso8859_10': CodecDetails(
        description=(
            'ISO-8859-10, Latin alphabet number 6 for Nordic languages.'
        )
    ),
    'iso8859_11': CodecDetails(
        description=(
            'ISO-8859-11, Latin/Thai alphabet.'
        )
    ),
    'iso8859_13': CodecDetails(
        description=(
            'ISO-8859-13, Latin alphabet number 7 for Baltic Rim languages.'
        )
    ),
    'iso8859_14': CodecDetails(
        description=(
            'ISO-8859-14, Latin alphabet number 8 for Celtic languages.'
        )
    ),
    'iso8859_15': CodecDetails(
        description=(
            'ISO-8859-15, Latin alphabet number 9 for Western European '
            'languages, including the Euro symbol.'
        )
    ),
    'iso8859_16': CodecDetails(
        description=(
            'ISO-8859-16, Latin alphabet number 10 for south-eastern Europe.'
        )
    ),
    'johab': CodecDetails(
        description=(
            'KS X 1001 alternative character set for South Korean Hangul '
            'and Hanja.'
        )
    ),
    'koi8_r': CodecDetails(
        description=(
            'Kod Obmena Informatsiey, 8 bit, for Russian and Bulgarian.'
        )
    ),
    'koi8_t': CodecDetails(
        description=(
            'Kod Obmena Informatsiey, 8 bit, for Tajik Cyrillic.'
        )
    ),
    'koi8_u': CodecDetails(
        description=(
            'RFC2319 Ukrainian Character Set KOI8-U.'
        )
    ),
    'kz1048': CodecDetails(
        description=(
            'Windows-1251 variant for Kazakh.'
        )
    ),
    'mac_cyrillic': CodecDetails(
        description=(
            'Mac OS Cyrillic.'
        )
    ),
    'mac_greek': CodecDetails(
        description=(
            'Mac OS Greek.'
        )
    ),
    'mac_iceland': CodecDetails(
        description=(
            'Mac OS Icelandic.'
        )
    ),
    'mac_latin2': CodecDetails(
        description=(
            'Mac OS Central European, Microsoft code page 10029.'
        )
    ),
    'mac_roman': CodecDetails(
        description=(
            'Mac OS Western Europe.'
        )
    ),
    'mac_turkish': CodecDetails(
        description=(
            'Mac OS Turkish.'
        )
    ),
    'ptcp154': CodecDetails(
        description=(
            'Cyrillic-Asian.'
        )
    ),
    'shift_jis': CodecDetails(
        2,
        description=(
            'Japanese Industrial Standard with shifted first bytes.'
        )
    ),
    'shift_jis_2004': CodecDetails(
        2,
        description=(
            'Superset of Japanese Industrial Standard with shifted '
            'first bytes.'
        )
    ),
    'shift_jisx0213': CodecDetails(
        2,
        description=(
            'Superset of Japanese Industrial Standard with shifted '
            'first bytes.'
        )
    ),
    'utf_32': CodecDetails(
        4,
        description=(
            '32-bit Unicode Transformation format.'
        )
    ),
    'utf_32_be': CodecDetails(
        4,
        'big',
        description=(
            '32-bit Unicode Transformation format, big endian.'
        )
    ),
    'utf_32_le': CodecDetails(
        4,
        'little',
        description=(
            '32-bit Unicode Transformation format, little endian.'
        )
    ),
    'utf_16': CodecDetails(
        2,
        description=(
            '16-bit Unicode Transformation format.'
        )
    ),
    'utf_16_be': CodecDetails(
        2,
        'big',
        description=(
            '16-bit Unicode Transformation format, big endian.'
        )
    ),
    'utf_16_le': CodecDetails(
        2,
        'little',
        description=(
            '16-bit Unicode Transformation format, little endian.'
        )
    ),
    'utf_7': CodecDetails(
        description=(
            '7-bit Unicode Transformation format.'
        )
    ),
    'utf_8': CodecDetails(
        description=(
            '8-bit Unicode Transformation format.'
        )
    ),
    'utf_8_sig': CodecDetails(
        description=(
            '8-bit Unicode Transformation format, treating the BOM '
            'as metadata.'
        )
    ),
}


# Functions.
def get_codecs() -> tuple[str, ...]:
    """List the name of every codec in the registry.

    :return: The registry keys, in the order they were registered,
        as a :class:`tuple`.
    :rtype: tuple

    :usage:

    To get a tuple containing the keys of the registered codecs::

        >>> get_codecs()                    # doctest: +ELLIPSIS
        ('ascii', 'big5', 'big5hkscs', 'cp037'... 'utf_8', 'utf_8_sig')

    """
    # Iterating a dict yields its keys, so no explicit loop is needed.
    return tuple(codecs)
def get_description(codeckey: str) -> str:
    """Look up the human-readable description of a registered codec.

    :param codeckey: The key for the codec.
    :return: The description of the codec as a :class:`str`.
    :rtype: str
    :raises KeyError: If `codeckey` is not a registered codec.

    :usage:

    To get the description for the given codec key::

        >>> get_description('ascii')
        'RFC20 The ASCII format for Network Interchange.'

    """
    return codecs[codeckey].description
def multidecode(
    value: int | str | bytes,
    codecs_: Iterable[str] | None = None
) -> dict[str, str]:
    """Provide the character for the given address for each of the
    given character sets.

    :param value: The address to decode.
    :param codecs_: (Optional.) The codecs to decode to. Any iterable
        of registered codec keys is accepted. Defaults to every
        registered codec.
    :return: The decoded value for each character set as a
        :class:`dict`. A codec that cannot decode the address maps
        to an empty :class:`str`.
    :rtype: dict
    :raises KeyError: If a name in `codecs_` is not a registered
        codec.

    :usage:

    To get the character for the given address for each of the
    registered codecs::

        >>> address = '0x61'
        >>> multidecode(address)                # doctest: +ELLIPSIS
        {'ascii': 'a', 'big5': 'a'... 'utf_8_sig': 'a'}

    If you just want the UTF-8 character::

        >>> value = 'a'
        >>> codecs_ = ('utf_8',)
        >>> multidecode(value, codecs_)
        {'utf_8': 'a'}

    :address formats:

    The understood :class:`str` formats for manual input are:

    *   Character: A string with length equal to one.
    *   Code Point: The prefix "U+" followed by a hexadecimal number.
    *   Binary String: The prefix "0b" followed by a binary number.
    *   Hex String: The prefix "0x" followed by a hexadecimal number.

    The following formats are available for use through the API:

    *   Bytes: A :class:`bytes`.
    *   Integer: An :class:`int`.

    """
    # Coerce the given value into bytes.
    raw = util.to_bytes(value)

    # Fall back to every registered codec when none are requested.
    if codecs_ is None:
        codecs_ = get_codecs()

    # Decode the value into the character sets.
    results = {}
    for codec in codecs_:
        details = codecs[codec]

        # Pad for 2 or 4 byte codecs. The zero bytes go on the most
        # significant side so the numeric value of the address is
        # preserved: after the data for little endian, before it
        # otherwise. Values already long enough are left untouched.
        if details.endian == 'little':
            padded = raw.ljust(details.size, b'\x00')
        else:
            padded = raw.rjust(details.size, b'\x00')

        # Decode. An address that is invalid for the codec is
        # reported as an empty string rather than aborting the run.
        try:
            results[codec] = padded.decode(codec)
        except UnicodeDecodeError:
            results[codec] = ''
    return results
def multiencode(
    value: bytes | int | str,
    codecs_: Iterable[str] | None = None
) -> dict[str, bytes]:
    r"""Provide the address for the given character for each of the
    given character sets.

    :param value: The character to encode.
    :param codecs_: (Optional.) The codecs to encode to. Any iterable
        of codec names is accepted. Defaults to every registered
        codec.
    :return: The encoded value for each character set as a
        :class:`dict`. A codec that cannot encode the character maps
        to an empty :class:`bytes`.
    :rtype: dict

    :usage:

    To encode a one character :class:`str` with all registered
    codecs::

        >>> value = 'a'
        >>> multiencode(value)                  # doctest: +ELLIPSIS
        {'ascii': b'a', 'big5': b'a'... 'utf_8_sig': b'\xef\xbb\xbfa'}

    If you just want the UTF-8 address::

        >>> value = 'a'
        >>> codecs_ = ('utf_8',)
        >>> multiencode(value, codecs_)
        {'utf_8': b'a'}

    :character formats:

    The understood :class:`str` formats available for manual input
    are (all formats are big endian unless otherwise stated):

    *   Character: A string with length equal to one.
    *   Code Point: The prefix "U+" followed by a hexadecimal number.
    *   Binary String: The prefix "0b" followed by a binary number.
    *   Octal String: The prefix "0o" followed by an octal number.
    *   Decimal String: The prefix "0d" followed by a decimal number.
    *   Hex String: The prefix "0x" followed by a hexadecimal number.

    The following formats are available for use through the API:

    *   Bytes: A :class:`bytes` that decodes to a valid UTF-8
        character.
    *   Integer: An :class:`int` within the range
        0x00 <= x <= 0x10FFFF.

    """
    # Coerce the given value into a character.
    char = util.to_char(value)

    # Fall back to every registered codec when none are requested.
    if codecs_ is None:
        codecs_ = get_codecs()

    # Encode the character with each codec. A character the codec
    # cannot represent is reported as empty bytes rather than
    # aborting the run.
    results = {}
    for codec in codecs_:
        try:
            results[codec] = char.encode(codec)
        except UnicodeEncodeError:
            results[codec] = b''
    return results