(PureBasic) Unicode Escape and Unescape Text in StringBuilder

Demonstrates the available options for Unicode-escaping non-US-ASCII characters and emojis.

Note: This example requires Chilkat v11.1.0 or greater.

For more information, see https://www.chilkatsoft.com/unicode_escape.asp

Chilkat PureBasic Module Download

Chilkat PureBasic Module

IncludeFile "CkStringBuilder.pb"

; ChilkatExample
;
; Loads a UTF-16 text file into a StringBuilder, copies it into a second
; working StringBuilder, and then runs the working copy through each of the
; Unicode escaping styles below, printing the escaped text with Debug.
; For the styles where this example calls ckDecode, the text is decoded
; back and printed again before moving on to the next style.
;
; Takes no parameters and returns no value.  Both StringBuilder objects
; created here are disposed on every exit path.
Procedure ChilkatExample()

    success.i = 0

    ; Holds the pristine file content; used to reset the working copy later.
    sbOriginal.i = CkStringBuilder::ckCreate()
    If sbOriginal.i = 0
        Debug "Failed to create object."
        ProcedureReturn
    EndIf

    success = CkStringBuilder::ckLoadFile(sbOriginal,"qa_data/txt/utf16_emojis_accented_jap.txt","utf-16")
    If success = 0
        Debug CkStringBuilder::ckLastErrorText(sbOriginal)
        CkStringBuilder::ckDispose(sbOriginal)
        ProcedureReturn
    EndIf

    ; The above file contains text that includes some emojis,
    ; Japanese chars, and accented chars.

    ; Working copy that is escaped/unescaped in place.
    sb.i = CkStringBuilder::ckCreate()
    If sb.i = 0
        Debug "Failed to create object."
        ; sbOriginal was already created successfully, so release it before bailing out.
        CkStringBuilder::ckDispose(sbOriginal)
        ProcedureReturn
    EndIf

    CkStringBuilder::ckAppendSb(sb,sbOriginal)

    ; Charset is not used for unicode escaping.  Set it to "utf-8", but it means nothing.
    charsetNotUsed.s = "utf-8"

    ; Indicate the desired format/style of Unicode escaping.
    ; Choose JSON-style (JavaScript-style) Unicode escape sequences by using "unicodeescape"
    encoding.s = "unicodeescape"

    CkStringBuilder::ckEncode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; Output:
    ; \ud83e\udde0
    ; \ud83d\udd10
    ; \u2705
    ; \u26a0\ufe0f
    ; \u274c
    ; \u2713
    ; \u4e2d
    ; \u00e9 xyz \u00e0
    ; abc \u79c1 \u306f \u3093 ghi

    ; Revert back to the unescaped chars:
    CkStringBuilder::ckDecode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; -----------------------------------------------------------------------------------------
    ; Do the same, but use uppercase letters (A-F) in the hex values.
    encoding = "unicodeescape-upper"
    CkStringBuilder::ckEncode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; Output:
    ; \uD83E\uDDE0
    ; \uD83D\uDD10
    ; \u2705
    ; \u26A0\uFE0F
    ; \u274C
    ; \u2713
    ; \u4E2D
    ; \u00E9 xyz \u00E0
    ; abc \u79C1 \u306F \u3093 ghi

    ; Revert back to the unescaped chars:
    CkStringBuilder::ckDecode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; -----------------------------------------------------------------------------------------
    ;  ECMAScript (JavaScript) "code point escape" syntax

    encoding = "unicodeescape-curly"
    CkStringBuilder::ckEncode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; Output:
    ; \u{d83e}\u{dde0}
    ; \u{d83d}\u{dd10}
    ; \u{2705}
    ; \u{26a0}\u{fe0f}
    ; \u{274c}
    ; \u{2713}
    ; \u{4e2d}
    ; \u{00e9} xyz \u{00e0}
    ; abc \u{79c1} \u{306f} \u{3093} ghi

    ; Revert back to the unescaped chars:
    CkStringBuilder::ckDecode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; -----------------------------------------------------------------------------------------
    ; Do the same, but use uppercase letters (A-F) in the hex values.
    encoding = "unicodeescape-curly-upper"
    CkStringBuilder::ckEncode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; Output:
    ; \u{D83E}\u{DDE0}
    ; \u{D83D}\u{DD10}
    ; \u{2705}
    ; \u{26A0}\u{FE0F}
    ; \u{274C}
    ; \u{2713}
    ; \u{4E2D}
    ; \u{00E9} xyz \u{00E0}
    ; abc \u{79C1} \u{306F} \u{3093} ghi

    ; Revert back to the unescaped chars:
    CkStringBuilder::ckDecode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; -----------------------------------------------------------------------------------------
    ; HTML hexadecimal character reference

    encoding = "unicodeescape-htmlhex"
    CkStringBuilder::ckEncode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; Output:
    ; &#x1f9e0;
    ; &#x1f510;
    ; &#x2705;
    ; &#x26a0;&#xfe0f;
    ; &#x274c;
    ; &#x2713;
    ; &#x4e2d;
    ; &#xe9; xyz &#xe0;
    ; abc &#x79c1; &#x306f; &#x3093; ghi

    ; Revert back to the unescaped chars:
    CkStringBuilder::ckDecode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; -----------------------------------------------------------------------------------------
    ; HTML decimal character reference

    encoding = "unicodeescape-htmldec"
    CkStringBuilder::ckEncode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; Output:
    ; &#129504;
    ; &#128272;
    ; &#9989;
    ; &#9888;&#65039;
    ; &#10060;
    ; &#10003;
    ; &#20013;
    ; &#233; xyz &#224;
    ; abc &#31169; &#12399; &#12435; ghi

    ; Revert back to the unescaped chars:
    CkStringBuilder::ckDecode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; -----------------------------------------------------------------------------------------
    ; Unicode code point notation or U+ notation

    encoding = "unicodeescape-plus"
    CkStringBuilder::ckEncode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; Output:
    ; u+1f9e0
    ; u+1f510
    ; u+2705
    ; u+26a0u+fe0f
    ; u+274c
    ; u+2713
    ; u+4e2d
    ; u+00e9 xyz u+00e0
    ; abc u+79c1 u+306f u+3093 ghi

    ; Chilkat cannot unescape the Unicode code point notation or U+ notation.
    ; For this style, Chilkat only goes in one direction, which is to escape.

    ; To emit uppercase hex, specify unicodeescape-plus-upper
    ; (shown for illustration only; no encode call is made with this value here).
    encoding = "unicodeescape-plus-upper"
    ; ...
    ; ...

    ; The U+ text cannot be decoded, so restore the working copy from the original.
    CkStringBuilder::ckClear(sb)
    CkStringBuilder::ckAppendSb(sb,sbOriginal)

    ; -----------------------------------------------------------------------------------------
    ; Hex in Angle Brackets

    encoding = "unicodeescape-angle"
    CkStringBuilder::ckEncode(sb,encoding,charsetNotUsed)
    Debug CkStringBuilder::ckGetAsString(sb)

    ; Output:
    ; <1f9e0>
    ; <1f510>
    ; <2705>
    ; <26a0><fe0f>
    ; <274c>
    ; <2713>
    ; <4e2d>
    ; <e9> xyz <e0>
    ; abc <79c1> <306f> <3093> ghi

    ; Chilkat cannot unescape the angle bracket notation.
    ; For this style, Chilkat only goes in one direction, which is to escape.

    ; Restore the working copy from the original once more.
    CkStringBuilder::ckClear(sb)
    CkStringBuilder::ckAppendSb(sb,sbOriginal)


    ; Release both objects created by this procedure.
    CkStringBuilder::ckDispose(sbOriginal)
    CkStringBuilder::ckDispose(sb)


    ProcedureReturn
EndProcedure