|
|
(JavaScript) Convert utf-8 Text File to Windows-1252
Demonstrates how to convert a text file from its utf-8 byte representation to windows-1252. Note: This example requires Chilkat v11.0.0 or greater.
var success = false;
// Converts a file containing the following to windows-1252:
// <greetings>
// <message>Hello, world!</message>
// <message>¡Hola, mundo!</message>
// <message>Bonjour, le monde!</message>
// <message>Hallo, Welt!</message>
// <message>Olá, mundo!</message>
// <message>Привет, мир!</message>
// <message>你好,世界!</message>
// <message>こんにちは、世界!</message>
// <message>안녕하세요, 세계!</message>
// <message>😊🌍</message>
// </greetings>
// --------------------------------------------------------------------------------------------------------------------------
// Note:
// Windows-1252 is an 8-bit single-byte encoding. It can only encode:
//
// The basic ASCII set (0x00–0x7F).
// Latin-1 Supplement (0xA0–0xFF), plus some extra printable characters (like curly quotes, €, etc.).
// In total: 256 possible code points, covering most Western European languages but nothing outside of Latin script.
// --------------------------------------------------------------------------------------------------------------------------
// Characters in your XML that are representable
//
// Hello, world! ✅ (ASCII only)
// ¡Hola, mundo! ✅ (inverted exclamation mark U+00A1 is in Windows-1252)
// Bonjour, le monde! ✅
// Hallo, Welt! ✅
// Olá, mundo! ✅ (U+00E1 á and U+00F3 ó are in Windows-1252)
// --------------------------------------------------------------------------------------------------------------------------
// Characters that break conversion
//
// Russian / Cyrillic: Привет, мир!
// → These are Cyrillic characters (U+041F … U+0440). Not representable in Windows-1252. Conversion would require replacement (e.g. with ? or XML character references).
// Chinese: 你好,世界!
// → CJK ideographs (U+4F60, U+597D, etc.). Not in Windows-1252.
// Japanese: こんにちは、世界!
// → Hiragana + CJK. Not in Windows-1252.
// Korean: 안녕하세요, 세계!
// → Hangul syllables. Not in Windows-1252.
// Emoji: 😊🌍
// → Unicode Supplementary Multilingual Plane (U+1F60A, U+1F30D). Windows-1252 cannot encode any emoji.
var bd = new CkBinData();
// Load the utf-8 bytes.
success = bd.LoadFile("qa_data/xml/utf8test.xml");
if (success == false) {
console.log(bd.LastErrorText);
return;
}
// If allOrNone = true, then the conversion fails and the contents of the BinData
// are left unchanged if any char is unconvertable.
// If allOrNone = false, then non-convertable chars are discarded.
var allOrNone = false;
var fromCharset = "utf-8";
var toCharset = "windows-1252";
success = bd.CharsetConvert(fromCharset,toCharset,allOrNone);
// The return value will be false if any utf-8 chars were discarded because of non-convertability.
if (success == false) {
console.log("Some utf-8 chars could not be converted to windows-1252");
}
else {
console.log("All utf-8 chars were converted to windows-1252");
}
success = bd.WriteFile("c:/temp/qa_output/out.xml");
// The output file contains the following, where all non-convertible chars were discarded:
// <greetings>
// <message>Hello, world!</message>
// <message>¡Hola, mundo!</message>
// <message>Bonjour, le monde!</message>
// <message>Hallo, Welt!</message>
// <message>Olá, mundo!</message>
// <message>, !</message>
// <message></message>
// <message></message>
// <message>, !</message>
// <message></message>
// </greetings>
|