(Chilkat2-Python) Unicode Escape

Demonstrates the available options for Unicode-escaping non-US-ASCII characters and emojis.

Note: This example requires Chilkat v11.1.0 or greater.

For more information, see https://www.chilkatsoft.com/unicode_escape.asp

Chilkat2 Python Downloads

install with pip
pip3 install chilkat2
or download... Python Module for Windows, Linux, Alpine Linux, MacOS

import sys
import chilkat2

# Load the UTF-16 encoded sample file into a StringBuilder.
sb = chilkat2.StringBuilder()
success = sb.LoadFile("qa_data/txt/utf16_emojis_accented_jap.txt", "utf-16")
if not success:
    print(sb.LastErrorText)
    # Exit with a non-zero status so a calling shell/script can detect the failure
    # (a bare sys.exit() would report success).
    sys.exit(1)

# The text to be escaped in each of the sections that follow.
original = sb.GetAsString()

# The above file contains the following text, which includes some emojis,
# Japanese characters, and accented characters.

# 🧠
# 🔐
# ✅
# ⚠️
# ❌
# ✓
# 中
# é xyz à
# abc 私 は ん ghi

crypt = chilkat2.Crypt2()

# Unicode escaping ignores the charset argument.
# "utf-8" is passed only as a placeholder value.
charsetNotUsed = "utf-8"

# The encoding name selects the format/style of the Unicode escapes.
# "unicodeescape" produces JSON-style (JavaScript-style) \uXXXX sequences.
encoding = "unicodeescape"

escaped = crypt.EncodeString(original, charsetNotUsed, encoding)
print(escaped)

# Output:
# \ud83e\udde0
# \ud83d\udd10
# \u2705
# \u26a0\ufe0f
# \u274c
# \u2713
# \u4e2d
# \u00e9 xyz \u00e0
# abc \u79c1 \u306f \u3093 ghi

# Decode the escape sequences to get the unescaped chars back:
unescaped = crypt.DecodeString(escaped, charsetNotUsed, encoding)
print(unescaped)

# -----------------------------------------------------------------------------------------
# Same \uXXXX style as "unicodeescape", but the hex digits A-F are emitted in uppercase.
encoding = "unicodeescape-upper"
escaped = crypt.EncodeString(original, charsetNotUsed, encoding)
print(escaped)

# Output:
# \uD83E\uDDE0
# \uD83D\uDD10
# \u2705
# \u26A0\uFE0F
# \u274C
# \u2713
# \u4E2D
# \u00E9 xyz \u00E0
# abc \u79C1 \u306F \u3093 ghi

# Decode the escape sequences to get the unescaped chars back:
unescaped = crypt.DecodeString(escaped, charsetNotUsed, encoding)
print(unescaped)

# -----------------------------------------------------------------------------------------
# ECMAScript (JavaScript) "code point escape" syntax: \u{...}

encoding = "unicodeescape-curly"
escaped = crypt.EncodeString(original, charsetNotUsed, encoding)
print(escaped)

# Output:
# \u{d83e}\u{dde0}
# \u{d83d}\u{dd10}
# \u{2705}
# \u{26a0}\u{fe0f}
# \u{274c}
# \u{2713}
# \u{4e2d}
# \u{00e9} xyz \u{00e0}
# abc \u{79c1} \u{306f} \u{3093} ghi

# Decode the escape sequences to get the unescaped chars back:
unescaped = crypt.DecodeString(escaped, charsetNotUsed, encoding)
print(unescaped)

# -----------------------------------------------------------------------------------------
# Same \u{...} style as "unicodeescape-curly", but the hex digits A-F are emitted in uppercase.
encoding = "unicodeescape-curly-upper"
escaped = crypt.EncodeString(original, charsetNotUsed, encoding)
print(escaped)

# Output:
# \u{D83E}\u{DDE0}
# \u{D83D}\u{DD10}
# \u{2705}
# \u{26A0}\u{FE0F}
# \u{274C}
# \u{2713}
# \u{4E2D}
# \u{00E9} xyz \u{00E0}
# abc \u{79C1} \u{306F} \u{3093} ghi

# Decode the escape sequences to get the unescaped chars back:
unescaped = crypt.DecodeString(escaped, charsetNotUsed, encoding)
print(unescaped)

# -----------------------------------------------------------------------------------------
# Unicode code point notation (also known as U+ notation)

encoding = "unicodeescape-plus"
escaped = crypt.EncodeString(original, charsetNotUsed, encoding)
print(escaped)

# Output:
# u+1f9e0
# u+1f510
# u+2705
# u+26a0u+fe0f
# u+274c
# u+2713
# u+4e2d
# u+00e9 xyz u+00e0
# abc u+79c1 u+306f u+3093 ghi

# This style is one-way only: Chilkat can escape to the Unicode code point (U+)
# notation, but it cannot unescape it.

# For uppercase hex digits, specify "unicodeescape-plus-upper" instead.
encoding = "unicodeescape-plus-upper"
# ...
# ...

# -----------------------------------------------------------------------------------------
# HTML hexadecimal character references: &#x...;

encoding = "unicodeescape-htmlhex"
escaped = crypt.EncodeString(original, charsetNotUsed, encoding)
print(escaped)

# Output:
# &#x1f9e0;
# &#x1f510;
# &#x2705;
# &#x26a0;&#xfe0f;
# &#x274c;
# &#x2713;
# &#x4e2d;
# &#xe9; xyz &#xe0;
# abc &#x79c1; &#x306f; &#x3093; ghi

# Decode the character references to get the unescaped chars back:
unescaped = crypt.DecodeString(escaped, charsetNotUsed, encoding)
print(unescaped)

# -----------------------------------------------------------------------------------------
# HTML decimal character references: &#...;

encoding = "unicodeescape-htmldec"
escaped = crypt.EncodeString(original, charsetNotUsed, encoding)
print(escaped)

# Output:
# &#129504;
# &#128272;
# &#9989;
# &#9888;&#65039;
# &#10060;
# &#10003;
# &#20013;
# &#233; xyz &#224;
# abc &#31169; &#12399; &#12435; ghi

# Decode the character references to get the unescaped chars back:
unescaped = crypt.DecodeString(escaped, charsetNotUsed, encoding)
print(unescaped)

# -----------------------------------------------------------------------------------------
# Hex code points enclosed in angle brackets: <...>

encoding = "unicodeescape-angle"
escaped = crypt.EncodeString(original, charsetNotUsed, encoding)
print(escaped)

# Output:
# <1f9e0>
# <1f510>
# <2705>
# <26a0><fe0f>
# <274c>
# <2713>
# <4e2d>
# <e9> xyz <e0>
# abc <79c1> <306f> <3093> ghi

# This style is one-way only: Chilkat can escape to the angle bracket notation,
# but it cannot unescape it.