mirror of
				https://github.com/Tencent/rapidjson.git
				synced 2025-11-04 12:17:41 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			433 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			433 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
// Copyright (C) 2011 Milo Yip
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
 | 
						|
 | 
						|
#include "unittest.h"
 | 
						|
#include "rapidjson/filereadstream.h"
 | 
						|
#include "rapidjson/filewritestream.h"
 | 
						|
#include "rapidjson/encodedstream.h"
 | 
						|
#include "rapidjson/stringbuffer.h"
 | 
						|
 | 
						|
using namespace rapidjson;
 | 
						|
 | 
						|
// Verification of encoders/decoders with Hoehrmann's UTF8 decoder
 | 
						|
 | 
						|
// http://www.unicode.org/Public/UNIDATA/Blocks.txt
 | 
						|
// Flattened list of Unicode block ranges used to drive the encoding tests.
// Entries come in pairs: the first and last code point of a block, both
// inclusive. The list is terminated by a single 0xFFFFFFFF sentinel, so it can
// be walked two elements at a time until the sentinel is reached.
// The surrogate blocks (U+D800..U+DFFF) are deliberately commented out.
static const unsigned kCodepointRanges[] = {
    0x0000,     0x007F,     // Basic Latin
    0x0080,     0x00FF,     // Latin-1 Supplement
    0x0100,     0x017F,     // Latin Extended-A
    0x0180,     0x024F,     // Latin Extended-B
    0x0250,     0x02AF,     // IPA Extensions
    0x02B0,     0x02FF,     // Spacing Modifier Letters
    0x0300,     0x036F,     // Combining Diacritical Marks
    0x0370,     0x03FF,     // Greek and Coptic
    0x0400,     0x04FF,     // Cyrillic
    0x0500,     0x052F,     // Cyrillic Supplement
    0x0530,     0x058F,     // Armenian
    0x0590,     0x05FF,     // Hebrew
    0x0600,     0x06FF,     // Arabic
    0x0700,     0x074F,     // Syriac
    0x0750,     0x077F,     // Arabic Supplement
    0x0780,     0x07BF,     // Thaana
    0x07C0,     0x07FF,     // NKo
    0x0800,     0x083F,     // Samaritan
    0x0840,     0x085F,     // Mandaic
    0x0900,     0x097F,     // Devanagari
    0x0980,     0x09FF,     // Bengali
    0x0A00,     0x0A7F,     // Gurmukhi
    0x0A80,     0x0AFF,     // Gujarati
    0x0B00,     0x0B7F,     // Oriya
    0x0B80,     0x0BFF,     // Tamil
    0x0C00,     0x0C7F,     // Telugu
    0x0C80,     0x0CFF,     // Kannada
    0x0D00,     0x0D7F,     // Malayalam
    0x0D80,     0x0DFF,     // Sinhala
    0x0E00,     0x0E7F,     // Thai
    0x0E80,     0x0EFF,     // Lao
    0x0F00,     0x0FFF,     // Tibetan
    0x1000,     0x109F,     // Myanmar
    0x10A0,     0x10FF,     // Georgian
    0x1100,     0x11FF,     // Hangul Jamo
    0x1200,     0x137F,     // Ethiopic
    0x1380,     0x139F,     // Ethiopic Supplement
    0x13A0,     0x13FF,     // Cherokee
    0x1400,     0x167F,     // Unified Canadian Aboriginal Syllabics
    0x1680,     0x169F,     // Ogham
    0x16A0,     0x16FF,     // Runic
    0x1700,     0x171F,     // Tagalog
    0x1720,     0x173F,     // Hanunoo
    0x1740,     0x175F,     // Buhid
    0x1760,     0x177F,     // Tagbanwa
    0x1780,     0x17FF,     // Khmer
    0x1800,     0x18AF,     // Mongolian
    0x18B0,     0x18FF,     // Unified Canadian Aboriginal Syllabics Extended
    0x1900,     0x194F,     // Limbu
    0x1950,     0x197F,     // Tai Le
    0x1980,     0x19DF,     // New Tai Lue
    0x19E0,     0x19FF,     // Khmer Symbols
    0x1A00,     0x1A1F,     // Buginese
    0x1A20,     0x1AAF,     // Tai Tham
    0x1B00,     0x1B7F,     // Balinese
    0x1B80,     0x1BBF,     // Sundanese
    0x1BC0,     0x1BFF,     // Batak
    0x1C00,     0x1C4F,     // Lepcha
    0x1C50,     0x1C7F,     // Ol Chiki
    0x1CD0,     0x1CFF,     // Vedic Extensions
    0x1D00,     0x1D7F,     // Phonetic Extensions
    0x1D80,     0x1DBF,     // Phonetic Extensions Supplement
    0x1DC0,     0x1DFF,     // Combining Diacritical Marks Supplement
    0x1E00,     0x1EFF,     // Latin Extended Additional
    0x1F00,     0x1FFF,     // Greek Extended
    0x2000,     0x206F,     // General Punctuation
    0x2070,     0x209F,     // Superscripts and Subscripts
    0x20A0,     0x20CF,     // Currency Symbols
    0x20D0,     0x20FF,     // Combining Diacritical Marks for Symbols
    0x2100,     0x214F,     // Letterlike Symbols
    0x2150,     0x218F,     // Number Forms
    0x2190,     0x21FF,     // Arrows
    0x2200,     0x22FF,     // Mathematical Operators
    0x2300,     0x23FF,     // Miscellaneous Technical
    0x2400,     0x243F,     // Control Pictures
    0x2440,     0x245F,     // Optical Character Recognition
    0x2460,     0x24FF,     // Enclosed Alphanumerics
    0x2500,     0x257F,     // Box Drawing
    0x2580,     0x259F,     // Block Elements
    0x25A0,     0x25FF,     // Geometric Shapes
    0x2600,     0x26FF,     // Miscellaneous Symbols
    0x2700,     0x27BF,     // Dingbats
    0x27C0,     0x27EF,     // Miscellaneous Mathematical Symbols-A
    0x27F0,     0x27FF,     // Supplemental Arrows-A
    0x2800,     0x28FF,     // Braille Patterns
    0x2900,     0x297F,     // Supplemental Arrows-B
    0x2980,     0x29FF,     // Miscellaneous Mathematical Symbols-B
    0x2A00,     0x2AFF,     // Supplemental Mathematical Operators
    0x2B00,     0x2BFF,     // Miscellaneous Symbols and Arrows
    0x2C00,     0x2C5F,     // Glagolitic
    0x2C60,     0x2C7F,     // Latin Extended-C
    0x2C80,     0x2CFF,     // Coptic
    0x2D00,     0x2D2F,     // Georgian Supplement
    0x2D30,     0x2D7F,     // Tifinagh
    0x2D80,     0x2DDF,     // Ethiopic Extended
    0x2DE0,     0x2DFF,     // Cyrillic Extended-A
    0x2E00,     0x2E7F,     // Supplemental Punctuation
    0x2E80,     0x2EFF,     // CJK Radicals Supplement
    0x2F00,     0x2FDF,     // Kangxi Radicals
    0x2FF0,     0x2FFF,     // Ideographic Description Characters
    0x3000,     0x303F,     // CJK Symbols and Punctuation
    0x3040,     0x309F,     // Hiragana
    0x30A0,     0x30FF,     // Katakana
    0x3100,     0x312F,     // Bopomofo
    0x3130,     0x318F,     // Hangul Compatibility Jamo
    0x3190,     0x319F,     // Kanbun
    0x31A0,     0x31BF,     // Bopomofo Extended
    0x31C0,     0x31EF,     // CJK Strokes
    0x31F0,     0x31FF,     // Katakana Phonetic Extensions
    0x3200,     0x32FF,     // Enclosed CJK Letters and Months
    0x3300,     0x33FF,     // CJK Compatibility
    0x3400,     0x4DBF,     // CJK Unified Ideographs Extension A
    0x4DC0,     0x4DFF,     // Yijing Hexagram Symbols
    0x4E00,     0x9FFF,     // CJK Unified Ideographs
    0xA000,     0xA48F,     // Yi Syllables
    0xA490,     0xA4CF,     // Yi Radicals
    0xA4D0,     0xA4FF,     // Lisu
    0xA500,     0xA63F,     // Vai
    0xA640,     0xA69F,     // Cyrillic Extended-B
    0xA6A0,     0xA6FF,     // Bamum
    0xA700,     0xA71F,     // Modifier Tone Letters
    0xA720,     0xA7FF,     // Latin Extended-D
    0xA800,     0xA82F,     // Syloti Nagri
    0xA830,     0xA83F,     // Common Indic Number Forms
    0xA840,     0xA87F,     // Phags-pa
    0xA880,     0xA8DF,     // Saurashtra
    0xA8E0,     0xA8FF,     // Devanagari Extended
    0xA900,     0xA92F,     // Kayah Li
    0xA930,     0xA95F,     // Rejang
    0xA960,     0xA97F,     // Hangul Jamo Extended-A
    0xA980,     0xA9DF,     // Javanese
    0xAA00,     0xAA5F,     // Cham
    0xAA60,     0xAA7F,     // Myanmar Extended-A
    0xAA80,     0xAADF,     // Tai Viet
    0xAB00,     0xAB2F,     // Ethiopic Extended-A
    0xABC0,     0xABFF,     // Meetei Mayek
    0xAC00,     0xD7AF,     // Hangul Syllables
    0xD7B0,     0xD7FF,     // Hangul Jamo Extended-B
    //0xD800,       0xDB7F,     // High Surrogates
    //0xDB80,       0xDBFF,     // High Private Use Surrogates
    //0xDC00,       0xDFFF,     // Low Surrogates
    0xE000,     0xF8FF,     // Private Use Area
    0xF900,     0xFAFF,     // CJK Compatibility Ideographs
    0xFB00,     0xFB4F,     // Alphabetic Presentation Forms
    0xFB50,     0xFDFF,     // Arabic Presentation Forms-A
    0xFE00,     0xFE0F,     // Variation Selectors
    0xFE10,     0xFE1F,     // Vertical Forms
    0xFE20,     0xFE2F,     // Combining Half Marks
    0xFE30,     0xFE4F,     // CJK Compatibility Forms
    0xFE50,     0xFE6F,     // Small Form Variants
    0xFE70,     0xFEFF,     // Arabic Presentation Forms-B
    0xFF00,     0xFFEF,     // Halfwidth and Fullwidth Forms
    0xFFF0,     0xFFFF,     // Specials
    0x10000,    0x1007F,    // Linear B Syllabary
    0x10080,    0x100FF,    // Linear B Ideograms
    0x10100,    0x1013F,    // Aegean Numbers
    0x10140,    0x1018F,    // Ancient Greek Numbers
    0x10190,    0x101CF,    // Ancient Symbols
    0x101D0,    0x101FF,    // Phaistos Disc
    0x10280,    0x1029F,    // Lycian
    0x102A0,    0x102DF,    // Carian
    0x10300,    0x1032F,    // Old Italic
    0x10330,    0x1034F,    // Gothic
    0x10380,    0x1039F,    // Ugaritic
    0x103A0,    0x103DF,    // Old Persian
    0x10400,    0x1044F,    // Deseret
    0x10450,    0x1047F,    // Shavian
    0x10480,    0x104AF,    // Osmanya
    0x10800,    0x1083F,    // Cypriot Syllabary
    0x10840,    0x1085F,    // Imperial Aramaic
    0x10900,    0x1091F,    // Phoenician
    0x10920,    0x1093F,    // Lydian
    0x10A00,    0x10A5F,    // Kharoshthi
    0x10A60,    0x10A7F,    // Old South Arabian
    0x10B00,    0x10B3F,    // Avestan
    0x10B40,    0x10B5F,    // Inscriptional Parthian
    0x10B60,    0x10B7F,    // Inscriptional Pahlavi
    0x10C00,    0x10C4F,    // Old Turkic
    0x10E60,    0x10E7F,    // Rumi Numeral Symbols
    0x11000,    0x1107F,    // Brahmi
    0x11080,    0x110CF,    // Kaithi
    0x12000,    0x123FF,    // Cuneiform
    0x12400,    0x1247F,    // Cuneiform Numbers and Punctuation
    0x13000,    0x1342F,    // Egyptian Hieroglyphs
    0x16800,    0x16A3F,    // Bamum Supplement
    0x1B000,    0x1B0FF,    // Kana Supplement
    0x1D000,    0x1D0FF,    // Byzantine Musical Symbols
    0x1D100,    0x1D1FF,    // Musical Symbols
    0x1D200,    0x1D24F,    // Ancient Greek Musical Notation
    0x1D300,    0x1D35F,    // Tai Xuan Jing Symbols
    0x1D360,    0x1D37F,    // Counting Rod Numerals
    0x1D400,    0x1D7FF,    // Mathematical Alphanumeric Symbols
    0x1F000,    0x1F02F,    // Mahjong Tiles
    0x1F030,    0x1F09F,    // Domino Tiles
    0x1F0A0,    0x1F0FF,    // Playing Cards
    0x1F100,    0x1F1FF,    // Enclosed Alphanumeric Supplement
    0x1F200,    0x1F2FF,    // Enclosed Ideographic Supplement
    0x1F300,    0x1F5FF,    // Miscellaneous Symbols And Pictographs
    0x1F600,    0x1F64F,    // Emoticons
    0x1F680,    0x1F6FF,    // Transport And Map Symbols
    0x1F700,    0x1F77F,    // Alchemical Symbols
    0x20000,    0x2A6DF,    // CJK Unified Ideographs Extension B
    0x2A700,    0x2B73F,    // CJK Unified Ideographs Extension C
    0x2B740,    0x2B81F,    // CJK Unified Ideographs Extension D
    0x2F800,    0x2FA1F,    // CJK Compatibility Ideographs Supplement
    0xE0000,    0xE007F,    // Tags
    0xE0100,    0xE01EF,    // Variation Selectors Supplement
    0xF0000,    0xFFFFF,    // Supplementary Private Use Area-A
    0x100000,   0x10FFFF,   // Supplementary Private Use Area-B
    0xFFFFFFFF              // Sentinel: marks the end of the list
};
 | 
						|
 | 
						|
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 | 
						|
 | 
						|
// DFA states reported by decode(): UTF8_ACCEPT means a complete code point has
// just been decoded, UTF8_REJECT means the byte sequence is not valid UTF-8.
// Any other state value means more continuation bytes are expected.
#define UTF8_ACCEPT 0u
#define UTF8_REJECT 12u

// Combined lookup table for the UTF-8 decoding automaton:
//   [0, 256)   maps each byte value to a character class (0..11);
//   [256, 364) is the transition table, indexed by (state + character class),
//              where states are multiples of 12 (0, 12, 24, ..., 96).
static const unsigned char utf8d[] = {
    // The first part of the table maps bytes to character classes, to
    // reduce the size of the transition table and create bitmasks.
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
};

// Feeds one byte into the UTF-8 decoding automaton.
//
// state  in/out: current automaton state; start with UTF8_ACCEPT.
// codep  in/out: code point accumulated so far; only meaningful once the
//                returned state is UTF8_ACCEPT.
// byte   the next input byte. It indexes utf8d directly, so it must be in
//        [0, 255]; convert a char through unsigned char before passing it.
//
// Returns the new state (also stored in *state).
static inline unsigned decode(unsigned* state, unsigned* codep, unsigned byte) {
    const unsigned type = utf8d[byte];

    // Mid-sequence: append the low six payload bits of a continuation byte.
    // Start of a sequence: mask off the length-marker bits of the lead byte.
    *codep = (*state != UTF8_ACCEPT) ?
        (byte & 0x3fu) | (*codep << 6) :
        (0xffu >> type) & byte;

    *state = utf8d[256 + *state + type];
    return *state;
}
 | 
						|
 | 
						|
//static bool IsUTF8(unsigned char* s) {
//  unsigned codepoint, state = 0;
//
//  while (*s)
//      decode(&state, &codepoint, *s++);
//
//  return state == UTF8_ACCEPT;
//}
 | 
						|
 | 
						|
// For every code point listed in kCodepointRanges: encode it with UTF8<>, then
//   1. decode the bytes with Hoehrmann's reference decoder and compare,
//   2. decode them with UTF8<>::Decode and compare,
//   3. run them through UTF8<>::Validate and check the output is unchanged.
TEST(EncodingsTest, UTF8) {
    StringBuffer os, os2;
    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
            os.Clear();
            UTF8<>::Encode(os, codepoint);
            const char* encodedStr = os.GetString();

            // Decode with Hoehrmann
            {
                unsigned decodedCodepoint = 0;
                unsigned state = 0;

                unsigned decodedCount = 0;
                for (const char* s = encodedStr; *s; ++s) {
                    // decode() returns UTF8_ACCEPT (0) once a full code point is available.
                    if (!decode(&state, &decodedCodepoint, static_cast<unsigned char>(*s))) {
                        EXPECT_EQ(codepoint, decodedCodepoint);
                        decodedCount++;
                    }
                }

                if (*encodedStr)                // This decoder cannot handle U+0000
                    EXPECT_EQ(1u, decodedCount);    // Should only contain one code point

                EXPECT_EQ(UTF8_ACCEPT, state);
                if (UTF8_ACCEPT != state)
                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
            }

            // Decode
            {
                StringStream is(encodedStr);
                // Initialized so the diagnostic below never prints an
                // indeterminate value if Decode() fails without writing it.
                unsigned decodedCodepoint = 0;
                bool result = UTF8<>::Decode(is, &decodedCodepoint);
                EXPECT_TRUE(result);
                EXPECT_EQ(codepoint, decodedCodepoint);
                if (!result || codepoint != decodedCodepoint)
                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
            }

            // Validate
            {
                StringStream is(encodedStr);
                os2.Clear();
                bool result = UTF8<>::Validate(is, os2);
                EXPECT_TRUE(result);
                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
            }
        }
    }
}
 | 
						|
 | 
						|
// For every code point listed in kCodepointRanges: encode it with UTF16<>, then
//   1. build the expected UTF-16 sequence independently (UTF-8 encode, decode
//      with Hoehrmann's decoder, form a surrogate pair if needed) and compare,
//   2. decode with UTF16<>::Decode and compare,
//   3. run through UTF16<>::Validate and check the output is unchanged.
TEST(EncodingsTest, UTF16) {
    GenericStringBuffer<UTF16<> > os, os2;
    GenericStringBuffer<UTF8<> > utf8os;
    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
            os.Clear();
            UTF16<>::Encode(os, codepoint);
            const UTF16<>::Ch* encodedStr = os.GetString();

            // Encode with Hoehrmann's code
            if (codepoint != 0) // cannot handle U+0000
            {
                // encode with UTF8<> first
                utf8os.Clear();
                UTF8<>::Encode(utf8os, codepoint);

                // transcode from UTF8 to UTF16 with Hoehrmann's code
                unsigned decodedCodepoint = 0;
                unsigned state = 0;
                UTF16<>::Ch buffer[3], *p = &buffer[0];
                for (const char* s = utf8os.GetString(); *s; ++s) {
                    if (!decode(&state, &decodedCodepoint, static_cast<unsigned char>(*s)))
                        break;
                }

                if (codepoint <= 0xFFFF)
                    *p++ = static_cast<UTF16<>::Ch>(decodedCodepoint);
                else {
                    // Encode code points above U+FFFF as surrogate pair.
                    // 0xD7C0 is 0xD800 - (0x10000 >> 10), folding the offset subtraction into the base.
                    *p++ = static_cast<UTF16<>::Ch>(0xD7C0 + (decodedCodepoint >> 10));
                    *p++ = static_cast<UTF16<>::Ch>(0xDC00 + (decodedCodepoint & 0x3FF));
                }
                *p = '\0';

                EXPECT_EQ(0, StrCmp(buffer, encodedStr));
            }

            // Decode
            {
                GenericStringStream<UTF16<> > is(encodedStr);
                // Initialized so the diagnostic below never prints an
                // indeterminate value if Decode() fails without writing it.
                unsigned decodedCodepoint = 0;
                bool result = UTF16<>::Decode(is, &decodedCodepoint);
                EXPECT_TRUE(result);
                EXPECT_EQ(codepoint, decodedCodepoint);
                if (!result || codepoint != decodedCodepoint)
                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
            }

            // Validate
            {
                GenericStringStream<UTF16<> > is(encodedStr);
                os2.Clear();
                bool result = UTF16<>::Validate(is, os2);
                EXPECT_TRUE(result);
                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
            }
        }
    }
}
 | 
						|
 | 
						|
// For every code point listed in kCodepointRanges: encode it with UTF32<>, then
//   1. decode with UTF32<>::Decode and compare,
//   2. run through UTF32<>::Validate and check the output is unchanged.
TEST(EncodingsTest, UTF32) {
    GenericStringBuffer<UTF32<> > os, os2;
    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
            os.Clear();
            UTF32<>::Encode(os, codepoint);
            const UTF32<>::Ch* encodedStr = os.GetString();

            // Decode
            {
                GenericStringStream<UTF32<> > is(encodedStr);
                // Initialized so the diagnostic below never prints an
                // indeterminate value if Decode() fails without writing it.
                unsigned decodedCodepoint = 0;
                bool result = UTF32<>::Decode(is, &decodedCodepoint);
                EXPECT_TRUE(result);
                EXPECT_EQ(codepoint, decodedCodepoint);
                if (!result || codepoint != decodedCodepoint)
                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
            }

            // Validate
            {
                GenericStringStream<UTF32<> > is(encodedStr);
                os2.Clear();
                bool result = UTF32<>::Validate(is, os2);
                EXPECT_TRUE(result);
                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
            }
        }
    }
}
 |