# Source code for charex.escape

"""
escape
~~~~~~

Character escape schemes.
"""
import random
from collections.abc import Callable
from json import loads

from charex import util
from charex.db import cache, get_characters_in_block


# Registry.
# Maps a scheme key to the function implementing that escape scheme.
# Each function takes the character to escape and a codec name, and
# returns the escaped text. Entries are added by the `reg_escape`
# decorator and looked up by key in `escape` and `get_description`.
schemes: dict[str, Callable[[str, str], str]] = {}


# Caches.
# Maps a character to its HTML escape. It is consulted by `escape_html`
# before falling back to the HTML entity data.
cached_entities: dict[str, str] = {}


# Registration.
class reg_escape:
    """Decorator that files an escape scheme in the scheme registry.

    The decorated function is stored in `schemes` under the given key
    and is returned unchanged, so it can still be called directly.

    :param key: The name the escape scheme is registered under.

    :usage:

    To register a new escape scheme:

        >>> @reg_escape('double')
        ... def double(char: str, codec: str) -> str:
        ...     '''Double the character.'''
        ...     return char + char
        ...
        >>> # Demonstrate the registration worked.
        >>> 'double' in get_schemes()
        True
        >>> escape_text('spam', 'double')
        'ssppaamm'

    """
    def __init__(self, key: str) -> None:
        # Remember the registry key until the decorator is applied.
        self.key = key

    def __call__(
        self,
        fn: Callable[[str, str], str]
    ) -> Callable[[str, str], str]:
        # Registering replaces any scheme already stored under the key.
        schemes[self.key] = fn
        return fn
# Exceptions.
class EscapeError(ValueError):
    """The escape scheme could not escape the character."""


# Utility functions.
def get_named_entity(char: str) -> str:
    """Get a named entity from the HTML entity data.

    The character is converted to a casefolded code with
    :func:`util.to_code` and looked up in `cache.entity_map`.

    :param char: The character to look up.
    :return: The `name` of the last entry stored for the character,
        or the decimal numeric character reference produced by
        :func:`escape_htmldec` when the character has no entry.
    :rtype: str
    """
    code = util.to_code(char).casefold()

    # No entity data for this character: use the numeric reference.
    if code not in cache.entity_map:
        return escape_htmldec(char, '')

    return cache.entity_map[code][-1].name


def get_description(schemekey: str) -> str:
    """Get the description for the scheme.

    :param schemekey: The key for the scheme in the scheme registry.
    :return: The description as a :class:`str`.
    :rtype: str
    """
    # An unregistered key raises KeyError from the registry lookup.
    return util.get_description_from_docstring(schemes[schemekey])
def get_schemes() -> tuple[str, ...]:
    """Return the keys of the registered escape schemes.

    :return: The scheme keys as a :class:`tuple`, in registry order.
    :rtype: tuple
    """
    # Iterating the registry dict yields its keys.
    return tuple(schemes)
def hex_byte_escape(char: str) -> str:
    """Perform the common single hexadecimal byte escape on the
    character.

    :param char: The character to escape.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    :raises EscapeError: If the code point is greater than 0xFF.
    """
    n = ord(char)
    if n > 0xFF:
        raise EscapeError('Cannot escape characters over 0xFF.')
    return f'\\x{n:02x}'


def lookup_escape(char: str, table: dict[str, str]) -> str:
    """Perform a table lookup to escape the character.

    :param char: The character to escape.
    :param table: The table for the lookup.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    :raises EscapeError: If the character is not a key of the table.
    """
    try:
        return table[char]
    except KeyError as ex:
        # Chain explicitly so the traceback shows the failed lookup
        # as the cause rather than as an error during handling.
        raise EscapeError('Character not in table.') from ex


def octal_escape(char: str) -> str:
    """Perform the common octal escape on the character.

    :param char: The character to escape.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    :raises EscapeError: If the code point is greater than 0o377.
    """
    n = ord(char)
    if n > 0o377:
        raise EscapeError('Cannot escape characters over 0o377.')
    return f'\\{n:o}'


def unicode_2_byte_escape(char: str) -> str:
    """Perform the common Unicode two byte escape on the character.

    :param char: The character to escape.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    :raises EscapeError: If the code point is greater than 0xFFFF.
    """
    n = ord(char)
    if n > 0xFFFF:
        raise EscapeError('Cannot escape characters over 0xFFFF.')
    return util.to_code(n, '\\u')


def unicode_utf16_escape(char: str) -> str:
    """Perform the common Unicode UTF-16 escape on the character.

    Characters that fit in a single two byte escape are escaped with
    :func:`unicode_2_byte_escape`. Any other character is encoded as
    big-endian UTF-16 and each two byte unit is escaped separately.

    :param char: The character to escape.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    try:
        return unicode_2_byte_escape(char)
    except EscapeError:
        b = char.encode('utf_16_be')
        # One `\uXXXX` escape per pair of encoded bytes.
        return ''.join(
            f'\\u{b[i]:02x}{b[i + 1]:02x}'
            for i in range(0, len(b), 2)
        )


# Escape schemes.
@reg_escape('c')
def escape_c(char: str, codec: str) -> str:
    """Escape scheme for C escape sequences as defined by C17. This
    is derived from the Wikipedia list, since I don't have access to
    the C17 specification.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    table = {
        '\u0007': r'\a',
        '\u0008': r'\b',
        '\u000c': r'\f',
        '\u000a': r'\n',
        '\u000d': r'\r',
        '\u0009': r'\t',
        '\u000b': r'\v',
        # '\u001b': r'\e',    # Non-standard, supported by gcc, clang, tcc.
        '\u0027': r"\'",
        '\u0022': r'\"',
        '\u003f': r'\?',
        '\u005c': r'\\',
    }
    try:
        return lookup_escape(char, table)
    except EscapeError:
        # No simple escape exists, so fall back to the octal scheme.
        return escape_co(char, codec)


@reg_escape('co')
def escape_co(char: str, codec: str) -> str:
    """Escape scheme for C octal escape sequences as defined by C17.
    This is derived from the Wikipedia list, since I don't have access
    to the C17 specification.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    try:
        return octal_escape(char)
    except EscapeError:
        # Too large for an octal escape, so use the Unicode scheme.
        return escape_cu(char, codec)


@reg_escape('cu')
def escape_cu(char: str, codec: str) -> str:
    """Escape scheme for C Unicode escape sequences as defined by C17.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    try:
        return unicode_2_byte_escape(char)
    except EscapeError:
        # Too large for a two byte escape, so use the long form.
        return escape_culong(char, codec)


@reg_escape('culong')
def escape_culong(char: str, codec: str) -> str:
    """Escape scheme for four byte C Unicode escape sequences as
    defined by C17.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    x = ord(char)
    return f'\\U{x:08x}'


@reg_escape('html')
def escape_html(char: str, codec: str) -> str:
    """Escape scheme for HTML named character references. It will
    return the decimal numeric character references if no named
    entity exists.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    try:
        return cached_entities[char]
    except KeyError:
        # Store the result so the next request for this character is
        # served from the cache instead of repeating the lookup.
        entity = get_named_entity(char)
        cached_entities[char] = entity
        return entity


@reg_escape('htmldec')
def escape_htmldec(char: str, codec: str) -> str:
    """Escape scheme for HTML decimal numeric character references.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    n = ord(char)
    return f'&#{n};'


@reg_escape('htmlhex')
def escape_htmlhex(char: str, codec: str) -> str:
    """Escape scheme for HTML hexadecimal numeric character references.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    n = ord(char)
    return f'&#x{n:x};'


@reg_escape('java')
def escape_java(char: str, codec: str) -> str:
    """Escape scheme for Java encoding, based on the Java SE
    Specification. The specification can be found `here.`_

    .. _here: https://docs.oracle.com/javase/specs/jls/se20/html/jls-3.html

    :param char: The character to escape.
    :param codec: Unused by this scheme; passed through to the
        fallback scheme.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    table = {
        '\u0008': r'\b',
        '\u0020': r'\s',
        '\u000c': r'\f',
        '\u000a': r'\n',
        '\u000d': r'\r',
        '\u0009': r'\t',
        '\u0027': r"\'",
        '\u0022': r'\"',
        '\u005c': r'\\',
    }
    try:
        return lookup_escape(char, table)
    except EscapeError:
        return escape_javao(char, codec)


@reg_escape('javao')
def escape_javao(char: str, codec: str) -> str:
    """Escape scheme for Java octal encoding, based on the Java SE
    Specification. The specification can be found `here.`_

    .. _here: https://docs.oracle.com/javase/specs/jls/se20/html/jls-3.html

    :param char: The character to escape.
    :param codec: Unused by this scheme; passed through to the
        fallback scheme.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    try:
        return octal_escape(char)
    except EscapeError:
        return escape_javau(char, codec)


@reg_escape('javau')
def escape_javau(char: str, codec: str) -> str:
    """Escape scheme for Java Unicode encoding, based on the Java SE
    Specification. The specification can be found `here.`_

    .. _here: https://docs.oracle.com/javase/specs/jls/se20/html/jls-3.html

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    return unicode_utf16_escape(char)


@reg_escape('js')
def escape_js(char: str, codec: str) -> str:
    """Escape scheme for JavaScript encoding, based on the ECMA-262
    Specification. The specification can be found `here.`_

    .. _here: https://262.ecma-international.org/13.0/\
        #sec-literals-string-literals

    :param char: The character to escape.
    :param codec: Unused by this scheme; passed through to the
        fallback scheme.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    table = {
        '\u0008': r'\b',
        '\u0009': r'\t',
        '\u000a': r'\n',
        '\u000b': r'\v',
        '\u000c': r'\f',
        '\u000d': r'\r',
        '\u0022': r'\"',
        '\u005c': r'\\',
    }
    try:
        return lookup_escape(char, table)
    except EscapeError:
        return escape_jso(char, codec)


@reg_escape('jso')
def escape_jso(char: str, codec: str) -> str:
    """Escape scheme for JavaScript octal encoding, based on the
    ECMA-262 Specification. The specification can be found `here.`_

    .. _here: https://262.ecma-international.org/13.0/\
        #sec-literals-string-literals

    :param char: The character to escape.
    :param codec: Unused by this scheme; passed through to the
        fallback scheme.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    try:
        return octal_escape(char)
    except EscapeError:
        return escape_jsu(char, codec)


@reg_escape('jsu')
def escape_jsu(char: str, codec: str) -> str:
    """Escape scheme for JavaScript unicode encoding, based on the
    ECMA-262 Specification. The specification can be found `here.`_

    .. _here: https://262.ecma-international.org/13.0/\
        #sec-literals-string-literals

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    try:
        return unicode_2_byte_escape(char)
    except EscapeError:
        return escape_jscp(char, codec)


@reg_escape('jscp')
def escape_jscp(char: str, codec: str) -> str:
    """Escape scheme for JavaScript code point encoding, based on the
    ECMA-262 Specification. The specification can be found `here.`_

    .. _here: https://262.ecma-international.org/13.0/\
        #sec-literals-string-literals

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    x = ord(char)
    return f'\\u{{{x:x}}}'


@reg_escape('json')
def escape_json(char: str, codec: str) -> str:
    """Escape scheme for JSON encoding, based on the ECMA-404
    Specification. The specification can be found `here.`_

    .. _here: https://www.ecma-international.org/publications-and-standards/\
        standards/ecma-404/ECMA-404_2nd_edition_december_2017.pdf

    :param char: The character to escape.
    :param codec: Unused by this scheme; passed through to the
        fallback scheme.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    table = {
        '\u0022': r'\"',
        '\u005c': r'\\',
        '\u002f': r'\/',
        '\u0008': r'\b',
        '\u000c': r'\f',
        '\u000a': r'\n',
        '\u000d': r'\r',
        '\u0009': r'\t',
    }
    # Same lookup-then-fallback shape as the other table schemes.
    try:
        return lookup_escape(char, table)
    except EscapeError:
        return escape_jsonu(char, codec)


@reg_escape('jsonu')
def escape_jsonu(char: str, codec: str) -> str:
    """Escape scheme for JSON Unicode encoding, based on the ECMA-404
    Specification. The specification can be found `here.`_

    .. _here: https://www.ecma-international.org/publications-and-standards/\
        standards/ecma-404/ECMA-404_2nd_edition_december_2017.pdf

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    return unicode_utf16_escape(char)


@reg_escape('smol')
def escape_smol(char: str, codec: str) -> str:
    """Escape scheme for smol characters, based loosely on the Unicode
    superscript characters.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`. Characters
        other than the lowercase letters a-z are returned unchanged.
    :rtype: str
    """
    norms = 'abcdefghijklmnopqrstuvwxyz'
    smol = 'ᵃᵇᶜᵈᵉᶠᵍʰᶦʲᵏˡᵐⁿᵒᵖᑫʳˢᵗᵘᵛʷˣʸᶻ'
    table = dict(zip(norms, smol))
    try:
        return lookup_escape(char, table)
    except EscapeError:
        return char


@reg_escape('sql')
def escape_sql(char: str, codec: str) -> str:
    """Escape scheme for MySQL encoding, based on the MySQL
    Specification. The specification can be found `here.`_

    .. _here: https://dev.mysql.com/doc/refman/8.0/en/string-literals.html

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`. Characters
        without a MySQL escape sequence are returned unchanged.
    :rtype: str
    """
    table = {
        '\u0000': r'\0',
        '\u0027': r"\'",
        '\u0022': r'\"',
        '\u0008': r'\b',
        '\u000a': r'\n',
        '\u000d': r'\r',
        '\u0009': r'\t',
        # \Z is ASCII 26 *decimal* (Control+Z), which is U+001A.
        # U+0026 is the ampersand, which MySQL does not escape.
        '\u001a': r'\Z',
        '\u005c': r'\\',
        '\u0025': r'\%',
        '\u005f': r'\_',
    }
    try:
        return lookup_escape(char, table)
    except EscapeError:
        return char


@reg_escape('sqldq')
def escape_sqldq(char: str, codec: str) -> str:
    """Escape scheme for MySQL encoding, based on the MySQL
    Specification. This escapes quotes by doubling them rather than
    using a backslash. The specification can be found `here.`_

    .. _here: https://dev.mysql.com/doc/refman/8.0/en/string-literals.html

    :param char: The character to escape.
    :param codec: Unused.
    :return: The escaped character as a :class:`str`. Characters
        without a MySQL escape sequence are returned unchanged.
    :rtype: str
    """
    table = {
        '\u0000': r'\0',
        '\u0027': r"''",
        '\u0022': r'""',
        '\u0008': r'\b',
        '\u000a': r'\n',
        '\u000d': r'\r',
        '\u0009': r'\t',
        # \Z is ASCII 26 *decimal* (Control+Z), which is U+001A.
        # U+0026 is the ampersand, which MySQL does not escape.
        '\u001a': r'\Z',
        '\u005c': r'\\',
        '\u0025': r'\%',
        '\u005f': r'\_',
    }
    try:
        return lookup_escape(char, table)
    except EscapeError:
        return char


@reg_escape('url')
def escape_url(char: str, codec: str) -> str:
    """Escape scheme for URL percent encoding.

    :param char: The character to escape.
    :param codec: The character set to use when encoding the
        character.
    :return: The escaped character as a :class:`str`.
    :rtype: str
    """
    b = char.encode(codec)
    # One uppercase `%XX` octet per encoded byte.
    return ''.join(f'%{x:02X}' for x in b)


@reg_escape('zalgo')
def escape_zalgo(char: str, codec: str) -> str:
    """Escape scheme for zalgo (glitch) text.

    :param char: The character to escape.
    :param codec: Unused.
    :return: The character followed by one to ten randomly chosen
        characters from the Combining Diacritical Marks block, as
        a :class:`str`.
    :rtype: str
    """
    marks = get_characters_in_block('Combining Diacritical Marks')
    num_marks = random.randint(1, 10)
    return char + ''.join(random.choice(marks) for _ in range(num_marks))


# Bulk escape.
def escape(s: str, schemekey: str, codec: str = 'utf8') -> str:
    """Escape the string with the scheme.

    :param s: The string to escape.
    :param schemekey: The key in the `schemes` :class:`dict` to use
        for the escaping.
    :param codec: The character set codec to use when escaping
        the characters.
    :return: The escaped :class:`str`.
    :rtype: str
    :raises KeyError: If no scheme is registered under `schemekey`.
    """
    scheme = schemes[schemekey]
    return ''.join(scheme(char, codec) for char in s)