|
|
Ruby String Processing - UTF-8, Unicode, ANSI, 8-bit, etc.
Sample Ruby script showing the capabilities of Chilkat's string object for the Ruby programming language. The CkString object can be used freely and does not require the purchase of a license.
Download Ruby Programming Example Scripts
# file: StringExample1.rb
# Ruby doesn't help much regarding charsets, character encodings,
# Unicode, and internationalization.
# The CkString object is designed to solve just
# about any internationalized string processing issue
# you might have with Ruby.
require 'chilkat'
# Build the CkString instance used throughout this script.
# Its text mixes 7bit us-ascii characters with a handful of
# 8bit characters that are common in French, Spanish, and
# other European languages.
strObj = Chilkat::CkString.new
strObj.appendAnsi('abc eèéêë')
# Fetch the ANSI bytes back as a plain Ruby string.
rubyStr = strObj.getAnsi
# Dump every byte of the string as a decimal number,
# each one followed by a single space.
rubyStr.each_byte { |byte| print "#{byte} " }
print "\n"
# Expected console output:
# 97 98 99 32 101 232 233 234 235
# The SPACE character shows up as 32, and the 8-bit
# characters as 232, 233, 234 and 235.
# These can be checked against the character code chart here:
# http://www.microsoft.com/globaldev/reference/iso/28591.mspx
# For instance, the è character is 0xE8, i.e. 232 in decimal.
# Note: a DOS console window will not display these characters
# properly when the script is run from a DOS prompt. That is a
# limitation of the MS-DOS console, not of Ruby. Redirect the
# output to a file and open it in an editor instead...
print rubyStr, "\n"
# Confirm that the CkString object really holds the expected characters
# by looking at them in 7-bit encodings, starting with URL-encoding.
# Each pass URL-encodes the string in place using the given charset,
# prints it, and then decodes it again to restore the original text.
#
# iso-8859-1 (a.k.a latin-1) is a 1-byte/char encoding, so every 8-bit
# character becomes a single escaped byte:
#   abc+e%E8%E9%EA%EB
# With utf-8 each 8-bit character takes 2 bytes, while the 7-bit
# us-ascii chars remain a single byte each:
#   abc+e%C3%A8%C3%A9%C3%AA%C3%AB
['iso-8859-1', 'utf-8'].each do |charset|
  strObj.urlEncode(charset)
  print "URL Encoded (#{charset}) = ", strObj.getAnsi, "\n"
  strObj.urlDecode(charset)
end
# Look at the string with its 8-bit chars replaced by HTML entities.
# The output should resemble: abc e&egrave;&eacute;&ecirc;&euml;
# (NOTE(review): exact entity spelling is up to Chilkat - confirm.)
strObj.entityEncode
print "HTML entity encoded = ", strObj.getAnsi, "\n"
# Decode in place so the original text is back for the next steps.
strObj.entityDecode
# Writing the string to a file works with any character encoding:
strObj.saveToFile('strAnsi.txt', 'iso-8859-1')
strObj.saveToFile('strUtf8.txt', 'utf-8')
# Prepare the objects used by the examples that follow.
# The main string is replaced with a longer piece of text:
strObj.setStringAnsi('eè abc eèéêë')
# Small builder: returns a fresh CkString holding the given ANSI text.
newAnsiString = lambda do |text|
  built = Chilkat::CkString.new
  built.appendAnsi(text)
  built
end
strObj1 = newAnsiString.call('eèé')
strObj2 = newAnsiString.call('eè')
strObj3 = newAnsiString.call('eè abc')
strObj4 = newAnsiString.call('xyz')
strObj5 = newAnsiString.call('êë')
# Wildcard pattern for the matchesStr example.
strObjMatch = newAnsiString.call('eè*abc*ë')
# Most systems fail to handle 8bit characters (èéêë)
# when converting to uppercase/lowercase.
# CkString does not...
strObj.toUpperCase
# Prints EÈ ABC EÈÉÊË
print strObj.getAnsi, "\n"
# Convert back to lowercase and print again...
strObj.toLowerCase
# Prints eè abc eèéêë
print strObj.getAnsi, "\n"
# A few string operations follow. Each check prints a
# confirmation line only when the test succeeds.
# Suffix test against a plain Ruby string:
print "endsWith worked!\n" if strObj.endsWith('éêë')
# Suffix test against another CkString:
print "endsWithStr worked!\n" if strObj.endsWithStr(strObj5)
# Prefix test against another CkString:
print "beginsWithStr worked!\n" if strObj.beginsWithStr(strObj3)
# The indexOf and indexOfStr methods return the character position
# (not byte position) of the first substring match in a string.
# 0 is the 1st char position, returns -1 if no match found...
# Search using a plain Ruby string:
idx = strObj.indexOf('èé')
print "indexOf = #{idx}\n"
# Search using another CkString. The label names the method actually
# called so the two output lines can be told apart.
idx = strObj.indexOfStr(strObj1)
print "indexOfStr = #{idx}\n"
# Replacing sub-strings.
# Work on a copy so the main string stays untouched,
# and replace every occurrence of the substring.
replaced = strObj.clone
replaced.replaceAllOccurances('eè', 'XYZ')
# Prints XYZ abc XYZéêë
print replaced.getAnsi, "\n"
# Same again on a fresh copy, but only the first occurrence is replaced.
replaced = strObj.clone
replaced.replaceFirstOccurance('eè', 'XYZ')
# Prints XYZ abc eèéêë
print replaced.getAnsi, "\n"
# Extract a substring given a starting position and a
# character count (the count is in chars, not bytes).
tail = strObj.substring(8, 4)
# Prints èéêë
print tail.getAnsi, "\n"
# CkString offers simple string matching, where the asterisk
# character '*' matches 0 or more of any character.
# Pattern given as a plain Ruby string:
print "matches succeeded!\n" if strObj.matches('*èé*')
# Pattern given as another CkString:
print "matchesStr succeeded!\n" if strObj.matchesStr(strObjMatch)
# getChar hands back a CkString rather than a byte, because a single
# char is not necessarily encoded as a single byte. For example, no
# Chinese, Japanese, or Korean character is encoded as a single byte
# in any encoding...
charAt8 = strObj.getChar(8)
# Prints è
print charAt8.getAnsi, "\n"
# Chop a copy of the string at the 1st occurrence of a substring.
chopped = strObj.clone
chopped.chopAtStr(strObj5)
# Prints eè abc eèé
print chopped.getAnsi, "\n"
# Encode the string in quoted-printable and in hex.
# Every pass encodes in place with the given charset, prints the
# result, and decodes again so the next pass sees the original text.
#
# Quoted-printable prints:
#   iso-8859-1: e=E8 abc e=E8=E9=EA=EB
#   utf-8:      e=C3=A8 abc e=C3=A8=C3=A9=C3=AA=C3=AB
['iso-8859-1', 'utf-8'].each do |charset|
  strObj.qpEncode(charset)
  print strObj.getAnsi, "\n"
  strObj.qpDecode(charset)
end
# Hex prints (ucs-2 is 2-byte/char Unicode):
#   iso-8859-1: 65E8206162632065E8E9EAEB
#   utf-8:      65C3A8206162632065C3A8C3A9C3AAC3AB
#   ucs-2:      6500E800200061006200630020006500E800E900EA00EB00
['iso-8859-1', 'utf-8', 'ucs-2'].each do |charset|
  strObj.hexEncode(charset)
  print strObj.getAnsi, "\n"
  strObj.hexDecode(charset)
end
# Report the size of the string in different encodings,
# followed by its length in characters.
print "size in bytes for utf-8: #{strObj.getSizeUtf8}\n"
print "size in bytes for ANSI: #{strObj.getSizeAnsi}\n"
print "number of characters: #{strObj.getNumChars}\n"
# trim strips whitespace from both ends, not including newlines
strObj.trim
# trim2 strips whitespace from both ends, including newlines
strObj.trim2
# Normalize all line endings to CRLF
strObj.toCRLF
# Normalize all line endings to bare linefeeds
strObj.toLF
# Shorten a copy of the string by N characters (not bytes)
# and show how many characters remain.
shortened = strObj.clone
shortened.shorten(5)
print "number of characters: #{shortened.getNumChars}\n"
# String equality.
# Each message ends with a newline, like every other line this script
# prints, so it cannot run into whatever is printed next.
# Exact comparison:
if strObj.equals('abc')
  print "Equals abc!\n"
end
# Comparison that ignores letter case:
if strObj.equalsIgnoreCase('Abc')
  print "Equals Abc!\n"
end
# Remove N characters from anywhere in the string.
# The removal is done on a copy of the main string.
chunkStart = 3
chunkLength = 4
withoutChunk = strObj.clone
# This removes 4 characters beginning with the 4th character in the
# string (the first char is at position 0). Units are in characters,
# not bytes.
withoutChunk.removeChunk(chunkStart, chunkLength)
# The string can be fetched in any character encoding.
# Each result is dumped as decimal byte values, one space after each.
utf8Text = strObj.getEnc('utf-8')
# prints 101 195 168 32 97 98 99 32 101 195 168 195 169 195 170 195 171
utf8Text.each_byte { |byte| print "#{byte} " }
print "\n"
ebcdicText = strObj.getEnc('ebcdic')
# prints 133 84 64 129 130 131 64 133 84 81 82 83
ebcdicText.each_byte { |byte| print "#{byte} " }
print "\n"
|