"""
charsets
~~~~~~~~
Data and functions for working with character sets.
"""
from collections.abc import Iterable, Iterator
from dataclasses import dataclass
from sys import byteorder

from charex import util
# Data classes.
@dataclass
class CodecDetails:
    """Metadata describing how to work with one particular codec.

    :param size: (Optional.) How many bytes the character set uses
        to address a character. Defaults to one.
    :param endian: (Optional.) The byte order used by the codec.
        Defaults to the byte order of the running system.
    :param description: (Optional.) A human-readable description
        of the codec. Defaults to an empty string.
    """
    # Number of bytes used to address a character.
    size: int = 1
    # Byte order; falls back to sys.byteorder when not given.
    endian: str = byteorder
    # Free-text summary of the codec.
    description: str = ''
# Encoding schemes.
# Registry of the supported codecs. Each key is the name handed to
# ``bytes.decode``/``str.encode``; each value records the address size,
# byte order, and a human-readable description of that codec.
codecs = {
    'ascii': CodecDetails(
        description=(
            'RFC20 The ASCII format for Network Interchange.'
        )
    ),
    'big5': CodecDetails(
        description=(
            'The Big5 encoding method for traditional Chinese characters '
            'developed by the Institute for Information Industry of Taiwan '
            'in 1984.'
        )
    ),
    'big5hkscs': CodecDetails(
        description=(
            'Hong Kong Supplementary Character Set to the Big5 traditional '
            'Chinese character set.'
        )
    ),
    'cp037': CodecDetails(
        description=(
            'EBCDIC code page 37, USA/Canada Country Extended Code Page.'
        )
    ),
    'cp273': CodecDetails(
        description=(
            'EBCDIC code page 273, Germany/Austria.'
        )
    ),
    'cp424': CodecDetails(
        description=(
            'EBCDIC code page 424, Israel with support for Hebrew.'
        )
    ),
    'cp437': CodecDetails(
        description=(
            'Code page 437, default character set for the IBM PC.'
        )
    ),
    'cp500': CodecDetails(
        description=(
            'EBCDIC code page 500, full support of the Latin-1 character set.'
        )
    ),
    'cp720': CodecDetails(
        description=(
            'Code page 720, Arabic support for DOS.'
        )
    ),
    'cp737': CodecDetails(
        description=(
            'Code page 737, Greek support for DOS.'
        )
    ),
    'cp775': CodecDetails(
        description=(
            'Code page 775, Baltic language support for DOS.'
        )
    ),
    'cp850': CodecDetails(
        description=(
            'Code page 850, Western European language support for DOS.'
        )
    ),
    'cp852': CodecDetails(
        description=(
            'Code page 852, Central European language support for DOS.'
        )
    ),
    'cp855': CodecDetails(
        description=(
            'Code page 855, Cyrillic support for DOS.'
        )
    ),
    'cp856': CodecDetails(
        description=(
            'Code page 856, Hebrew language support for DOS.'
        )
    ),
    'cp857': CodecDetails(
        description=(
            'Code page 857, Turkish language support for DOS.'
        )
    ),
    'cp858': CodecDetails(
        description=(
            'Code page 858, Western European language support for DOS, '
            'modifying code page 850 by adding the Euro symbol.'
        )
    ),
    'cp860': CodecDetails(
        description=(
            'Code page 860, Portuguese language support for DOS.'
        )
    ),
    'cp861': CodecDetails(
        description=(
            'Code page 861, Icelandic language support for DOS.'
        )
    ),
    'cp862': CodecDetails(
        description=(
            'Code page 862, Hebrew language support for DOS.'
        )
    ),
    'cp863': CodecDetails(
        description=(
            'Code page 863, Canadian French language support for DOS.'
        )
    ),
    'cp864': CodecDetails(
        description=(
            'Code page 864, Arabic language support for DOS.'
        )
    ),
    'cp865': CodecDetails(
        description=(
            'Code page 865, Nordic language support for DOS.'
        )
    ),
    'cp866': CodecDetails(
        description=(
            'Code page 866, Russian language support for DOS.'
        )
    ),
    'cp869': CodecDetails(
        description=(
            'Code page 869, Greek language support for DOS.'
        )
    ),
    'cp874': CodecDetails(
        description=(
            'Code page 874, Thai language support for DOS.'
        )
    ),
    'cp875': CodecDetails(
        description=(
            'EBCDIC code page 875, Greek.'
        )
    ),
    'cp932': CodecDetails(
        description=(
            'Code page 932, Japanese language support for Windows.'
        )
    ),
    'cp949': CodecDetails(
        description=(
            'Code page 949, Korean language support by IBM.'
        )
    ),
    'cp950': CodecDetails(
        description=(
            'Code page 950, Traditional Chinese language support for Windows.'
        )
    ),
    'cp1006': CodecDetails(
        description=(
            'Code page 1006, Urdu language support for AIX.'
        )
    ),
    'cp1026': CodecDetails(
        description=(
            'EBCDIC code page 1026, Turkish.'
        )
    ),
    'cp1125': CodecDetails(
        description=(
            'IBM code page 1125, Ukraine.'
        )
    ),
    'cp1140': CodecDetails(
        description=(
            'EBCDIC code page 1140, USA/Canada with Euro character.'
        )
    ),
    'cp1250': CodecDetails(
        description=(
            'Code page 1250, Central European language support for Windows.'
        )
    ),
    'cp1251': CodecDetails(
        description=(
            'Code page 1251, Cyrillic support for Windows.'
        )
    ),
    'cp1252': CodecDetails(
        description=(
            'Code page 1252, Latin-1 character set for Windows.'
        )
    ),
    'cp1253': CodecDetails(
        description=(
            'Code page 1253, Greek support for Windows.'
        )
    ),
    'cp1254': CodecDetails(
        description=(
            'Code page 1254, Turkish support for Windows.'
        )
    ),
    'cp1255': CodecDetails(
        description=(
            'Code page 1255, Hebrew support for Windows.'
        )
    ),
    'cp1256': CodecDetails(
        description=(
            'Code page 1256, Arabic support for Windows.'
        )
    ),
    'cp1257': CodecDetails(
        description=(
            'Code page 1257, Baltic language support for Windows.'
        )
    ),
    'cp1258': CodecDetails(
        description=(
            'Code page 1258, Vietnamese support for Windows.'
        )
    ),
    'euc_jp': CodecDetails(
        description=(
            'Extended Unix Code Japanese.'
        )
    ),
    'euc_jis_2004': CodecDetails(
        description=(
            'Extended Unix Code Japanese Industrial Standard 2004.'
        )
    ),
    'euc_jisx0213': CodecDetails(
        description=(
            'Extended Unix Code Japanese Industrial Standard X 0213.'
        )
    ),
    'euc_kr': CodecDetails(
        description=(
            'Extended Unix Code Korean.'
        )
    ),
    'gb2312': CodecDetails(
        description=(
            'Extended Unix Code Simplified Chinese.'
        )
    ),
    'gbk': CodecDetails(
        description=(
            'Extended Unix Code Simplified Chinese extended to include '
            'all unified CJK characters.'
        )
    ),
    'gb18030': CodecDetails(
        description=(
            'Chinese National Standard GB 18030-2005: Information '
            'Technology—Chinese coded character set.'
        )
    ),
    'hz': CodecDetails(
        description=(
            'Extended Unix Code Simplified Chinese for email.'
        )
    ),
    'iso2022_jp': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese.'
        )
    ),
    'iso2022_jp_1': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension 1.'
        )
    ),
    'iso2022_jp_2': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension 2.'
        )
    ),
    'iso2022_jp_2004': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension 2004.'
        )
    ),
    'iso2022_jp_3': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension 3.'
        )
    ),
    'iso2022_jp_ext': CodecDetails(
        description=(
            'ISO 2022 standard for Japanese, extension.'
        )
    ),
    'iso2022_kr': CodecDetails(
        description=(
            'RFC1557 Korean Character Encoding for Internet Messages.'
        )
    ),
    'latin_1': CodecDetails(
        description=(
            'ISO-8859-1, Latin alphabet number 1 for western Europe.'
        )
    ),
    'iso8859_2': CodecDetails(
        description=(
            'ISO-8859-2, Latin alphabet number 2 for central Europe.'
        )
    ),
    'iso8859_3': CodecDetails(
        description=(
            'ISO-8859-3, Latin alphabet number 3 for southern Europe.'
        )
    ),
    'iso8859_4': CodecDetails(
        description=(
            'ISO-8859-4, Latin alphabet number 4 for northern Europe.'
        )
    ),
    'iso8859_5': CodecDetails(
        description=(
            'ISO-8859-5, Latin/Cyrillic alphabet.'
        )
    ),
    'iso8859_6': CodecDetails(
        description=(
            'ISO-8859-6, Latin/Arabic alphabet.'
        )
    ),
    'iso8859_7': CodecDetails(
        description=(
            'ISO-8859-7, Latin/Greek alphabet.'
        )
    ),
    'iso8859_8': CodecDetails(
        description=(
            'ISO-8859-8, Latin/Hebrew alphabet.'
        )
    ),
    'iso8859_9': CodecDetails(
        description=(
            'ISO-8859-9, Latin alphabet number 5 for Turkish.'
        )
    ),
    'iso8859_10': CodecDetails(
        description=(
            'ISO-8859-10, Latin alphabet number 6 for Nordic languages.'
        )
    ),
    'iso8859_11': CodecDetails(
        description=(
            'ISO-8859-11, Latin/Thai alphabet.'
        )
    ),
    'iso8859_13': CodecDetails(
        description=(
            'ISO-8859-13, Latin alphabet number 7 for Baltic Rim languages.'
        )
    ),
    'iso8859_14': CodecDetails(
        description=(
            'ISO-8859-14, Latin alphabet number 8 for Celtic languages.'
        )
    ),
    'iso8859_15': CodecDetails(
        description=(
            'ISO-8859-15, Latin alphabet number 9 for Western European '
            'languages, including the Euro symbol.'
        )
    ),
    'iso8859_16': CodecDetails(
        description=(
            'ISO-8859-16, Latin alphabet number 10 for south-eastern Europe.'
        )
    ),
    'johab': CodecDetails(
        description=(
            'KS X 1001 alternative character set for South Korean Hangul '
            'and Hanja.'
        )
    ),
    'koi8_r': CodecDetails(
        description=(
            'Kod Obmena Informatsiey, 8 bit, for Russian and Bulgarian.'
        )
    ),
    'koi8_t': CodecDetails(
        description=(
            'Kod Obmena Informatsiey, 8 bit, for Tajik Cyrillic.'
        )
    ),
    'koi8_u': CodecDetails(
        description=(
            'RFC2319 Ukrainian Character Set KOI8-U.'
        )
    ),
    'kz1048': CodecDetails(
        description=(
            'Windows-1251 variant for Kazakh.'
        )
    ),
    'mac_cyrillic': CodecDetails(
        description=(
            'Mac OS Cyrillic.'
        )
    ),
    'mac_greek': CodecDetails(
        description=(
            'Mac OS Greek.'
        )
    ),
    'mac_iceland': CodecDetails(
        description=(
            'Mac OS Icelandic.'
        )
    ),
    'mac_latin2': CodecDetails(
        description=(
            'Mac OS Central European, Microsoft code page 10029.'
        )
    ),
    'mac_roman': CodecDetails(
        description=(
            'Mac OS Western Europe.'
        )
    ),
    'mac_turkish': CodecDetails(
        description=(
            'Mac OS Turkish.'
        )
    ),
    'ptcp154': CodecDetails(
        description=(
            'Cyrillic-Asian.'
        )
    ),
    'shift_jis': CodecDetails(
        2,
        description=(
            'Japanese Industrial Standard with shifted first bytes.'
        )
    ),
    'shift_jis_2004': CodecDetails(
        2,
        description=(
            'Superset of Japanese Industrial Standard with shifted '
            'first bytes.'
        )
    ),
    'shift_jisx0213': CodecDetails(
        2,
        description=(
            'Superset of Japanese Industrial Standard with shifted '
            'first bytes.'
        )
    ),
    'utf_32': CodecDetails(
        4,
        description=(
            '32-bit Unicode Transformation format.'
        )
    ),
    'utf_32_be': CodecDetails(
        4,
        'big',
        description=(
            '32-bit Unicode Transformation format, big endian.'
        )
    ),
    'utf_32_le': CodecDetails(
        4,
        'little',
        description=(
            '32-bit Unicode Transformation format, little endian.'
        )
    ),
    'utf_16': CodecDetails(
        2,
        description=(
            '16-bit Unicode Transformation format.'
        )
    ),
    'utf_16_be': CodecDetails(
        2,
        'big',
        description=(
            '16-bit Unicode Transformation format, big endian.'
        )
    ),
    'utf_16_le': CodecDetails(
        2,
        'little',
        description=(
            '16-bit Unicode Transformation format, little endian.'
        )
    ),
    'utf_7': CodecDetails(
        description=(
            '7-bit Unicode Transformation format.'
        )
    ),
    'utf_8': CodecDetails(
        description=(
            '8-bit Unicode Transformation format.'
        )
    ),
    'utf_8_sig': CodecDetails(
        description=(
            '8-bit Unicode Transformation format, treating the BOM '
            'as metadata.'
        )
    ),
}
# Functions.
def get_codecs() -> tuple[str, ...]:
    """Return the keys of the registered codecs.

    :return: The keys of the codecs as a :class:`tuple`, in the
        order they appear in the registry.
    :rtype: tuple

    :usage:

    To get a tuple containing the keys of the registered codecs::

        >>> get_codecs()  # doctest: +ELLIPSIS
        ('ascii', 'big5', 'big5hkscs', 'cp037'... 'utf_8', 'utf_8_sig')

    """
    # Iterating a dict yields its keys, so no generator is needed.
    return tuple(codecs)
def get_description(codeckey: str) -> str:
    """Look up the human-readable description of a codec.

    :param codeckey: The key the codec is registered under.
    :return: The description of the codec as a :class:`str`.
    :rtype: str
    :raises KeyError: If no codec is registered under `codeckey`.

    :usage:

    To get the description for the given codec key::

        >>> get_description('ascii')
        'RFC20 The ASCII format for Network Interchange.'

    """
    return codecs[codeckey].description
def multidecode(
    value: int | str | bytes,
    codecs_: Iterable[str] | None = None
) -> dict[str, str]:
    """Provide the character for the given address for each of the
    given character sets.

    :param value: The address to decode.
    :param codecs_: (Optional.) The keys of the codecs to decode
        with. Any iterable of registered codec keys is accepted.
        Defaults to every registered codec.
    :return: The decoded value for each character set as a
        :class:`dict`. A codec that cannot decode the address maps
        to an empty :class:`str`.
    :rtype: dict
    :raises KeyError: If a key in `codecs_` is not a registered codec.

    :usage:

    To get the character for the given address for each of the
    registered codecs::

        >>> address = '0x61'
        >>> multidecode(address)  # doctest: +ELLIPSIS
        {'ascii': 'a', 'big5': 'a'... 'utf_8_sig': 'a'}

    If you just want the UTF-8 character::

        >>> value = 'a'
        >>> codecs_ = ('utf_8',)
        >>> multidecode(value, codecs_)
        {'utf_8': 'a'}

    :address formats:

    The understood :class:`str` formats for manual input are:

    *   Character: A string with length equal to one.
    *   Code Point: The prefix "U+" followed by a hexadecimal number.
    *   Binary String: The prefix "0b" followed by a binary number.
    *   Hex String: The prefix "0x" followed by a hexadecimal number.

    The following formats are available for use through the API:

    *   Bytes: A :class:`bytes`.
    *   Integer: An :class:`int`.

    """
    # Coerce the given value into bytes.
    address = util.to_bytes(value)

    # Fall back to every registered codec when none were requested.
    if codecs_ is None:
        codecs_ = get_codecs()

    results = {}
    for codec in codecs_:
        details = codecs[codec]

        # Multi-byte codecs need the address padded out to their full
        # width with null bytes. The padding goes on the most
        # significant side: after the address for little endian,
        # before it otherwise. Addresses already wide enough are
        # left untouched.
        if details.endian == 'little':
            padded = address.ljust(details.size, b'\x00')
        else:
            padded = address.rjust(details.size, b'\x00')

        # An address that is invalid for a codec is reported as an
        # empty string rather than aborting the other codecs.
        try:
            results[codec] = padded.decode(codec)
        except UnicodeDecodeError:
            results[codec] = ''
    return results
def multiencode(
    value: bytes | int | str,
    codecs_: Iterable[str] | None = None
) -> dict[str, bytes]:
    r"""Provide the address for the given character for each of the
    given character sets.

    :param value: The character to encode.
    :param codecs_: (Optional.) The codecs to encode to. Any
        iterable of codec names is accepted. Defaults to every
        registered codec.
    :return: The encoded value for each character set as a
        :class:`dict`. A codec that cannot encode the character maps
        to an empty :class:`bytes`.
    :rtype: dict

    :usage:

    To encode a one character :class:`str` with all registered
    codecs::

        >>> value = 'a'
        >>> multiencode(value)  # doctest: +ELLIPSIS
        {'ascii': b'a', 'big5': b'a'... 'utf_8_sig': b'\xef\xbb\xbfa'}

    If you just want the UTF-8 address::

        >>> value = 'a'
        >>> codecs_ = ('utf_8',)
        >>> multiencode(value, codecs_)
        {'utf_8': b'a'}

    :character formats:

    The understood :class:`str` formats available for manual input are
    (all formats are big endian unless otherwise stated):

    *   Character: A string with length equal to one.
    *   Code Point: The prefix "U+" followed by a hexadecimal number.
    *   Binary String: The prefix "0b" followed by a binary number.
    *   Octal String: The prefix "0o" followed by an octal number.
    *   Decimal String: The prefix "0d" followed by a decimal number.
    *   Hex String: The prefix "0x" followed by a hexadecimal number.

    The following formats are available for use through the API:

    *   Bytes: A :class:`bytes` that decodes to a valid UTF-8
        character.
    *   Integer: An :class:`int` within the range 0x00 <= x <=
        0x10FFFF.

    """
    # Coerce the given value into a character.
    # NOTE(review): util.to_char is expected to return a str here,
    # since the result is encoded below -- confirm against util.
    char = util.to_char(value)

    # Fall back to every registered codec when none were requested.
    if codecs_ is None:
        codecs_ = get_codecs()

    results = {}
    for codec in codecs_:
        # A character the codec cannot represent is reported as empty
        # bytes rather than aborting the other codecs.
        try:
            results[codec] = char.encode(codec)
        except UnicodeEncodeError:
            results[codec] = b''
    return results