Implement wide->UTF-8 string conversion more correctly
This commit is contained in:
		| @@ -225,6 +225,12 @@ | ||||
| #include <sys/mman.h> | ||||
| #endif  // GTEST_HAS_STD_STRING && defined(GTEST_OS_LINUX) | ||||
|  | ||||
| // Determines whether the system compiler uses UTF-16 for encoding wide strings. | ||||
| #if defined(GTEST_OS_WINDOWS) || defined(GTEST_OS_CYGWIN) || \ | ||||
|         defined(__SYMBIAN32__) | ||||
| #define GTEST_WIDE_STRING_USES_UTF16_ 1 | ||||
| #endif | ||||
|  | ||||
| // Defines some utility macros. | ||||
|  | ||||
| // The GNU compiler emits a warning if nested "if" statements are followed by | ||||
|   | ||||
| @@ -133,8 +133,30 @@ class GTestFlagSaver { | ||||
|   internal::Int32 repeat_; | ||||
| } GTEST_ATTRIBUTE_UNUSED; | ||||
|  | ||||
| // Converts a Unicode code-point to its UTF-8 encoding. | ||||
| String ToUtf8String(wchar_t wchar); | ||||
| // Converts a Unicode code point to a narrow string in UTF-8 encoding. | ||||
| // code_point parameter is of type UInt32 because wchar_t may not be | ||||
| // wide enough to contain a code point. | ||||
| // The output buffer str must contain at least 32 characters. | ||||
| // The function returns the address of the output buffer. | ||||
| // If the code_point is not a valid Unicode code point | ||||
| // (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output | ||||
| // as '(Invalid Unicode 0xXXXXXXXX)'. | ||||
| char* CodePointToUtf8(UInt32 code_point, char* str); | ||||
|  | ||||
| // Converts a wide string to a narrow string in UTF-8 encoding. | ||||
| // The wide string is assumed to have the following encoding: | ||||
| //   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) | ||||
| //   UTF-32 if sizeof(wchar_t) == 4 (on Linux) | ||||
| // Parameter str points to a null-terminated wide string. | ||||
| // Parameter num_chars may additionally limit the number | ||||
| // of wchar_t characters processed. -1 is used when the entire string | ||||
| // should be processed. | ||||
| // If the string contains code points that are not valid Unicode code points | ||||
| // (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output | ||||
| // as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF-16 encoding | ||||
| // and contains invalid UTF-16 surrogate pairs, values in those pairs | ||||
| // will be encoded as individual Unicode characters from the Basic | ||||
| // Multilingual Plane. | ||||
| String WideStringToUtf8(const wchar_t* str, int num_chars); | ||||
|  | ||||
| // Returns the number of active threads, or 0 when there is an error. | ||||
| size_t GetThreadCount(); | ||||
|   | ||||
							
								
								
									
										153
									
								
								src/gtest.cc
									
									
									
									
									
								
							
							
						
						
									
										153
									
								
								src/gtest.cc
									
									
									
									
									
								
							| @@ -784,16 +784,19 @@ bool String::CStringEquals(const char * lhs, const char * rhs) { | ||||
| // encoding, and streams the result to the given Message object. | ||||
| static void StreamWideCharsToMessage(const wchar_t* wstr, size_t len, | ||||
|                                      Message* msg) { | ||||
|   for (size_t i = 0; i != len; i++) { | ||||
|   // TODO(wan): consider allowing a testing::String object to | ||||
|   // contain '\0'.  This will make it behave more like std::string, | ||||
|   // and will allow ToUtf8String() to return the correct encoding | ||||
|   // for '\0' s.t. we can get rid of the conditional here (and in | ||||
|   // several other places). | ||||
|     if (wstr[i]) { | ||||
|       *msg << internal::ToUtf8String(wstr[i]); | ||||
|   for (size_t i = 0; i != len; ) {  // NOLINT | ||||
|     if (wstr[i] != L'\0') { | ||||
|       *msg << WideStringToUtf8(wstr + i, len - i); | ||||
|       while (i != len && wstr[i] != L'\0') | ||||
|         i++; | ||||
|     } else { | ||||
|       *msg << '\0'; | ||||
|       i++; | ||||
|     } | ||||
|   } | ||||
| } | ||||
| @@ -852,8 +855,10 @@ String FormatForFailureMessage(wchar_t wchar) { | ||||
|   Message msg; | ||||
|   // A String object cannot contain '\0', so we print "\\0" when wchar is | ||||
|   // L'\0'. | ||||
|   msg << "L'" << (wchar ? ToUtf8String(wchar).c_str() : "\\0") << "' (" | ||||
|       << wchar_as_uint64 << ", 0x" << ::std::setbase(16) | ||||
|   char buffer[32];  // CodePointToUtf8 requires a buffer that big. | ||||
|   msg << "L'" | ||||
|       << (wchar ? CodePointToUtf8(static_cast<UInt32>(wchar), buffer) : "\\0") | ||||
|       << "' (" << wchar_as_uint64 << ", 0x" << ::std::setbase(16) | ||||
|       << wchar_as_uint64 << ")"; | ||||
|   return msg.GetString(); | ||||
| } | ||||
| @@ -1317,31 +1322,118 @@ inline UInt32 ChopLowBits(UInt32* bits, int n) { | ||||
|   return low_bits; | ||||
| } | ||||
|  | ||||
| // Converts a Unicode code-point to its UTF-8 encoding. | ||||
| String ToUtf8String(wchar_t wchar) { | ||||
|   char str[5] = {};  // Initializes str to all '\0' characters. | ||||
|  | ||||
|   UInt32 code = static_cast<UInt32>(wchar); | ||||
|   if (code <= kMaxCodePoint1) { | ||||
|     str[0] = static_cast<char>(code);                          // 0xxxxxxx | ||||
|   } else if (code <= kMaxCodePoint2) { | ||||
|     str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx | ||||
|     str[0] = static_cast<char>(0xC0 | code);                   // 110xxxxx | ||||
|   } else if (code <= kMaxCodePoint3) { | ||||
|     str[2] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx | ||||
|     str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx | ||||
|     str[0] = static_cast<char>(0xE0 | code);                   // 1110xxxx | ||||
|   } else if (code <= kMaxCodePoint4) { | ||||
|     str[3] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx | ||||
|     str[2] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx | ||||
|     str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6));  // 10xxxxxx | ||||
|     str[0] = static_cast<char>(0xF0 | code);                   // 11110xxx | ||||
| // Converts a Unicode code point to a narrow string in UTF-8 encoding. | ||||
| // code_point parameter is of type UInt32 because wchar_t may not be | ||||
| // wide enough to contain a code point. | ||||
| // The output buffer str must contain at least 32 characters. | ||||
| // The function returns the address of the output buffer. | ||||
| // If the code_point is not a valid Unicode code point | ||||
| // (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output | ||||
| // as '(Invalid Unicode 0xXXXXXXXX)'. | ||||
| char* CodePointToUtf8(UInt32 code_point, char* str) { | ||||
|   if (code_point <= kMaxCodePoint1) { | ||||
|     str[1] = '\0'; | ||||
|     str[0] = static_cast<char>(code_point);                          // 0xxxxxxx | ||||
|   } else if (code_point <= kMaxCodePoint2) { | ||||
|     str[2] = '\0'; | ||||
|     str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx | ||||
|     str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx | ||||
|   } else if (code_point <= kMaxCodePoint3) { | ||||
|     str[3] = '\0'; | ||||
|     str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx | ||||
|     str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx | ||||
|     str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx | ||||
|   } else if (code_point <= kMaxCodePoint4) { | ||||
|     str[4] = '\0'; | ||||
|     str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx | ||||
|     str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx | ||||
|     str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx | ||||
|     str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx | ||||
|   } else { | ||||
|     return String::Format("(Invalid Unicode 0x%llX)", | ||||
|                           static_cast<UInt64>(wchar)); | ||||
|     // The longest string String::Format can produce when invoked | ||||
|     // with these parameters is 28 characters long (not including | ||||
|     // the terminating nul character). We are asking for a 32-character | ||||
|     // buffer just in case. This is also enough for strncpy to | ||||
|     // null-terminate the destination string. | ||||
|     // MSVC 8 deprecates strncpy(), so we want to suppress warning | ||||
|     // 4996 (deprecated function) there. | ||||
| #ifdef GTEST_OS_WINDOWS  // We are on Windows. | ||||
| #pragma warning(push)          // Saves the current warning state. | ||||
| #pragma warning(disable:4996)  // Temporarily disables warning 4996. | ||||
| #endif | ||||
|     strncpy(str, String::Format("(Invalid Unicode 0x%X)", code_point).c_str(), | ||||
|             32); | ||||
| #ifdef GTEST_OS_WINDOWS  // We are on Windows. | ||||
| #pragma warning(pop)           // Restores the warning state. | ||||
| #endif | ||||
|     str[31] = '\0';  // Makes sure no change in the format to strncpy leaves | ||||
|                      // the result unterminated. | ||||
|   } | ||||
|   return str; | ||||
| } | ||||
|  | ||||
|   return String(str); | ||||
| // The following two functions only make sense if the system | ||||
| // uses UTF-16 for wide string encoding. All supported systems | ||||
| // with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. | ||||
|  | ||||
| // Determines if the arguments constitute a UTF-16 surrogate pair | ||||
| // and thus should be combined into a single Unicode code point | ||||
| // using CreateCodePointFromUtf16SurrogatePair. | ||||
| inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { | ||||
|   if (sizeof(wchar_t) == 2) | ||||
|     return (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; | ||||
|   else | ||||
|     return false; | ||||
| } | ||||
|  | ||||
| // Creates a Unicode code point from a UTF-16 surrogate pair. | ||||
| inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, | ||||
|                                                     wchar_t second) { | ||||
|   if (sizeof(wchar_t) == 2) { | ||||
|     const UInt32 mask = (1 << 10) - 1; | ||||
|     return (((first & mask) << 10) | (second & mask)) + 0x10000; | ||||
|   } else { | ||||
|     // This should not be called, but we provide a sensible default | ||||
|     // in case it is. | ||||
|     return static_cast<UInt32>(first); | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Converts a wide string to a narrow string in UTF-8 encoding. | ||||
| // The wide string is assumed to have the following encoding: | ||||
| //   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) | ||||
| //   UTF-32 if sizeof(wchar_t) == 4 (on Linux) | ||||
| // Parameter str points to a null-terminated wide string. | ||||
| // Parameter num_chars may additionally limit the number | ||||
| // of wchar_t characters processed. -1 is used when the entire string | ||||
| // should be processed. | ||||
| // If the string contains code points that are not valid Unicode code points | ||||
| // (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output | ||||
| // as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF-16 encoding | ||||
| // and contains invalid UTF-16 surrogate pairs, values in those pairs | ||||
| // will be encoded as individual Unicode characters from the Basic | ||||
| // Multilingual Plane. | ||||
| String WideStringToUtf8(const wchar_t* str, int num_chars) { | ||||
|   if (num_chars == -1) | ||||
|     num_chars = wcslen(str); | ||||
|  | ||||
|   StrStream stream; | ||||
|   for (int i = 0; i < num_chars; ++i) { | ||||
|     UInt32 unicode_code_point; | ||||
|  | ||||
|     if (str[i] == L'\0') { | ||||
|       break; | ||||
|     } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { | ||||
|       unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], | ||||
|                                                                  str[i + 1]); | ||||
|       i++; | ||||
|     } else { | ||||
|       unicode_code_point = static_cast<UInt32>(str[i]); | ||||
|     } | ||||
|  | ||||
|     char buffer[32];  // CodePointToUtf8 requires a buffer this big. | ||||
|     stream << CodePointToUtf8(unicode_code_point, buffer); | ||||
|   } | ||||
|   return StrStreamToString(&stream); | ||||
| } | ||||
|  | ||||
| // Converts a wide C string to a String using the UTF-8 encoding. | ||||
| @@ -1349,12 +1441,7 @@ String ToUtf8String(wchar_t wchar) { | ||||
| String String::ShowWideCString(const wchar_t * wide_c_str) { | ||||
|   if (wide_c_str == NULL) return String("(null)"); | ||||
|  | ||||
|   StrStream ss; | ||||
|   while (*wide_c_str) { | ||||
|     ss << internal::ToUtf8String(*wide_c_str++); | ||||
|   } | ||||
|  | ||||
|   return internal::StrStreamToString(&ss); | ||||
|   return String(internal::WideStringToUtf8(wide_c_str, -1).c_str()); | ||||
| } | ||||
|  | ||||
| // Similar to ShowWideCString(), except that this function encloses | ||||
|   | ||||
| @@ -101,6 +101,7 @@ using testing::TPRT_NONFATAL_FAILURE; | ||||
| using testing::TPRT_SUCCESS; | ||||
| using testing::UnitTest; | ||||
| using testing::internal::AppendUserMessage; | ||||
| using testing::internal::CodePointToUtf8; | ||||
| using testing::internal::EqFailure; | ||||
| using testing::internal::FloatingPoint; | ||||
| using testing::internal::GTestFlagSaver; | ||||
| @@ -111,8 +112,8 @@ using testing::internal::StreamableToString; | ||||
| using testing::internal::String; | ||||
| using testing::internal::TestProperty; | ||||
| using testing::internal::TestResult; | ||||
| using testing::internal::ToUtf8String; | ||||
| using testing::internal::UnitTestImpl; | ||||
| using testing::internal::WideStringToUtf8; | ||||
|  | ||||
| // This line tests that we can define tests in an unnamed namespace. | ||||
| namespace { | ||||
| @@ -142,65 +143,184 @@ TEST(NullLiteralTest, IsFalseForNonNullLiterals) { | ||||
| } | ||||
|  | ||||
| #endif  // __SYMBIAN32__ | ||||
| // Tests ToUtf8String(). | ||||
| // | ||||
| // Tests CodePointToUtf8(). | ||||
|  | ||||
| // Tests that the NUL character L'\0' is encoded correctly. | ||||
| TEST(ToUtf8StringTest, CanEncodeNul) { | ||||
|   EXPECT_STREQ("", ToUtf8String(L'\0').c_str()); | ||||
| TEST(CodePointToUtf8Test, CanEncodeNul) { | ||||
|   char buffer[32]; | ||||
|   EXPECT_STREQ("", CodePointToUtf8(L'\0', buffer)); | ||||
| } | ||||
|  | ||||
| // Tests that ASCII characters are encoded correctly. | ||||
| TEST(ToUtf8StringTest, CanEncodeAscii) { | ||||
|   EXPECT_STREQ("a", ToUtf8String(L'a').c_str()); | ||||
|   EXPECT_STREQ("Z", ToUtf8String(L'Z').c_str()); | ||||
|   EXPECT_STREQ("&", ToUtf8String(L'&').c_str()); | ||||
|   EXPECT_STREQ("\x7F", ToUtf8String(L'\x7F').c_str()); | ||||
| TEST(CodePointToUtf8Test, CanEncodeAscii) { | ||||
|   char buffer[32]; | ||||
|   EXPECT_STREQ("a", CodePointToUtf8(L'a', buffer)); | ||||
|   EXPECT_STREQ("Z", CodePointToUtf8(L'Z', buffer)); | ||||
|   EXPECT_STREQ("&", CodePointToUtf8(L'&', buffer)); | ||||
|   EXPECT_STREQ("\x7F", CodePointToUtf8(L'\x7F', buffer)); | ||||
| } | ||||
|  | ||||
| // Tests that Unicode code-points that have 8 to 11 bits are encoded | ||||
| // as 110xxxxx 10xxxxxx. | ||||
| TEST(ToUtf8StringTest, CanEncode8To11Bits) { | ||||
| TEST(CodePointToUtf8Test, CanEncode8To11Bits) { | ||||
|   char buffer[32]; | ||||
|   // 000 1101 0011 => 110-00011 10-010011 | ||||
|   EXPECT_STREQ("\xC3\x93", ToUtf8String(L'\xD3').c_str()); | ||||
|   EXPECT_STREQ("\xC3\x93", CodePointToUtf8(L'\xD3', buffer)); | ||||
|  | ||||
|   // 101 0111 0110 => 110-10101 10-110110 | ||||
|   EXPECT_STREQ("\xD5\xB6", ToUtf8String(L'\x576').c_str()); | ||||
|   EXPECT_STREQ("\xD5\xB6", CodePointToUtf8(L'\x576', buffer)); | ||||
| } | ||||
|  | ||||
| // Tests that Unicode code-points that have 12 to 16 bits are encoded | ||||
| // as 1110xxxx 10xxxxxx 10xxxxxx. | ||||
| TEST(ToUtf8StringTest, CanEncode12To16Bits) { | ||||
| TEST(CodePointToUtf8Test, CanEncode12To16Bits) { | ||||
|   char buffer[32]; | ||||
|   // 0000 1000 1101 0011 => 1110-0000 10-100011 10-010011 | ||||
|   EXPECT_STREQ("\xE0\xA3\x93", ToUtf8String(L'\x8D3').c_str()); | ||||
|   EXPECT_STREQ("\xE0\xA3\x93", CodePointToUtf8(L'\x8D3', buffer)); | ||||
|  | ||||
|   // 1100 0111 0100 1101 => 1110-1100 10-011101 10-001101 | ||||
|   EXPECT_STREQ("\xEC\x9D\x8D", ToUtf8String(L'\xC74D').c_str()); | ||||
|   EXPECT_STREQ("\xEC\x9D\x8D", CodePointToUtf8(L'\xC74D', buffer)); | ||||
| } | ||||
|  | ||||
| #if !defined(GTEST_OS_WINDOWS) && !defined(GTEST_OS_CYGWIN) && \ | ||||
|     !defined(__SYMBIAN32__) | ||||
|  | ||||
| #ifndef GTEST_WIDE_STRING_USES_UTF16_ | ||||
| // Tests in this group require a wchar_t to hold > 16 bits, and thus | ||||
| // are skipped on Windows, Cygwin, and Symbian, where a wchar_t is | ||||
| // 16-bit wide. | ||||
| // 16-bit wide. This code may not compile on those systems. | ||||
|  | ||||
| // Tests that Unicode code-points that have 17 to 21 bits are encoded | ||||
| // as 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. | ||||
| TEST(ToUtf8StringTest, CanEncode17To21Bits) { | ||||
| TEST(CodePointToUtf8Test, CanEncode17To21Bits) { | ||||
|   char buffer[32]; | ||||
|   // 0 0001 0000 1000 1101 0011 => 11110-000 10-010000 10-100011 10-010011 | ||||
|   EXPECT_STREQ("\xF0\x90\xA3\x93", ToUtf8String(L'\x108D3').c_str()); | ||||
|   EXPECT_STREQ("\xF0\x90\xA3\x93", CodePointToUtf8(L'\x108D3', buffer)); | ||||
|  | ||||
|   // 1 0111 1000 0110 0011 0100 => 11110-101 10-111000 10-011000 10-110100 | ||||
|   EXPECT_STREQ("\xF5\xB8\x98\xB4", ToUtf8String(L'\x178634').c_str()); | ||||
|   // 0 0001 0000 0100 0000 0000 => 11110-000 10-010000 10-010000 10-000000 | ||||
|   EXPECT_STREQ("\xF0\x90\x90\x80", CodePointToUtf8(L'\x10400', buffer)); | ||||
|  | ||||
|   // 1 0000 1000 0110 0011 0100 => 11110-100 10-001000 10-011000 10-110100 | ||||
|   EXPECT_STREQ("\xF4\x88\x98\xB4", CodePointToUtf8(L'\x108634', buffer)); | ||||
| } | ||||
|  | ||||
| // Tests that encoding an invalid code-point generates the expected result. | ||||
| TEST(ToUtf8StringTest, CanEncodeInvalidCodePoint) { | ||||
| TEST(CodePointToUtf8Test, CanEncodeInvalidCodePoint) { | ||||
|   char buffer[32]; | ||||
|   EXPECT_STREQ("(Invalid Unicode 0x1234ABCD)", | ||||
|                ToUtf8String(L'\x1234ABCD').c_str()); | ||||
|                CodePointToUtf8(L'\x1234ABCD', buffer)); | ||||
| } | ||||
|  | ||||
| #endif  // Windows, Cygwin, or Symbian | ||||
| #endif  // GTEST_WIDE_STRING_USES_UTF16_ | ||||
|  | ||||
| // Tests WideStringToUtf8(). | ||||
|  | ||||
| // Tests that the NUL character L'\0' is encoded correctly. | ||||
| TEST(WideStringToUtf8Test, CanEncodeNul) { | ||||
|   EXPECT_STREQ("", WideStringToUtf8(L"", 0).c_str()); | ||||
|   EXPECT_STREQ("", WideStringToUtf8(L"", -1).c_str()); | ||||
| } | ||||
|  | ||||
| // Tests that ASCII strings are encoded correctly. | ||||
| TEST(WideStringToUtf8Test, CanEncodeAscii) { | ||||
|   EXPECT_STREQ("a", WideStringToUtf8(L"a", 1).c_str()); | ||||
|   EXPECT_STREQ("ab", WideStringToUtf8(L"ab", 2).c_str()); | ||||
|   EXPECT_STREQ("a", WideStringToUtf8(L"a", -1).c_str()); | ||||
|   EXPECT_STREQ("ab", WideStringToUtf8(L"ab", -1).c_str()); | ||||
| } | ||||
|  | ||||
| // Tests that Unicode code-points that have 8 to 11 bits are encoded | ||||
| // as 110xxxxx 10xxxxxx. | ||||
| TEST(WideStringToUtf8Test, CanEncode8To11Bits) { | ||||
|   // 000 1101 0011 => 110-00011 10-010011 | ||||
|   EXPECT_STREQ("\xC3\x93", WideStringToUtf8(L"\xD3", 1).c_str()); | ||||
|   EXPECT_STREQ("\xC3\x93", WideStringToUtf8(L"\xD3", -1).c_str()); | ||||
|  | ||||
|   // 101 0111 0110 => 110-10101 10-110110 | ||||
|   EXPECT_STREQ("\xD5\xB6", WideStringToUtf8(L"\x576", 1).c_str()); | ||||
|   EXPECT_STREQ("\xD5\xB6", WideStringToUtf8(L"\x576", -1).c_str()); | ||||
| } | ||||
|  | ||||
| // Tests that Unicode code-points that have 12 to 16 bits are encoded | ||||
| // as 1110xxxx 10xxxxxx 10xxxxxx. | ||||
| TEST(WideStringToUtf8Test, CanEncode12To16Bits) { | ||||
|   // 0000 1000 1101 0011 => 1110-0000 10-100011 10-010011 | ||||
|   EXPECT_STREQ("\xE0\xA3\x93", WideStringToUtf8(L"\x8D3", 1).c_str()); | ||||
|   EXPECT_STREQ("\xE0\xA3\x93", WideStringToUtf8(L"\x8D3", -1).c_str()); | ||||
|  | ||||
|   // 1100 0111 0100 1101 => 1110-1100 10-011101 10-001101 | ||||
|   EXPECT_STREQ("\xEC\x9D\x8D", WideStringToUtf8(L"\xC74D", 1).c_str()); | ||||
|   EXPECT_STREQ("\xEC\x9D\x8D", WideStringToUtf8(L"\xC74D", -1).c_str()); | ||||
| } | ||||
|  | ||||
| // Tests that the conversion stops when the function encounters \0 character. | ||||
| TEST(WideStringToUtf8Test, StopsOnNulCharacter) { | ||||
|   EXPECT_STREQ("ABC", WideStringToUtf8(L"ABC\0XYZ", 100).c_str()); | ||||
| } | ||||
|  | ||||
| // Tests that the conversion stops when the function reaches the limit | ||||
| // specified by the 'num_chars' parameter. | ||||
| TEST(WideStringToUtf8Test, StopsWhenLengthLimitReached) { | ||||
|   EXPECT_STREQ("ABC", WideStringToUtf8(L"ABCDEF", 3).c_str()); | ||||
| } | ||||
|  | ||||
|  | ||||
| #ifndef GTEST_WIDE_STRING_USES_UTF16_ | ||||
| // Tests that Unicode code-points that have 17 to 21 bits are encoded | ||||
| // as 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. This code may not compile | ||||
| // on the systems using UTF-16 encoding. | ||||
| TEST(WideStringToUtf8Test, CanEncode17To21Bits) { | ||||
|   // 0 0001 0000 1000 1101 0011 => 11110-000 10-010000 10-100011 10-010011 | ||||
|   EXPECT_STREQ("\xF0\x90\xA3\x93", WideStringToUtf8(L"\x108D3", 1).c_str()); | ||||
|   EXPECT_STREQ("\xF0\x90\xA3\x93", WideStringToUtf8(L"\x108D3", -1).c_str()); | ||||
|  | ||||
|   // 1 0000 1000 0110 0011 0100 => 11110-100 10-001000 10-011000 10-110100 | ||||
|   EXPECT_STREQ("\xF4\x88\x98\xB4", WideStringToUtf8(L"\x108634", 1).c_str()); | ||||
|   EXPECT_STREQ("\xF4\x88\x98\xB4", WideStringToUtf8(L"\x108634", -1).c_str()); | ||||
| } | ||||
|  | ||||
| // Tests that encoding an invalid code-point generates the expected result. | ||||
| TEST(WideStringToUtf8Test, CanEncodeInvalidCodePoint) { | ||||
|   EXPECT_STREQ("(Invalid Unicode 0xABCDFF)", | ||||
|                WideStringToUtf8(L"\xABCDFF", -1).c_str()); | ||||
| } | ||||
| #else | ||||
| // Tests that surrogate pairs are encoded correctly on the systems using | ||||
| // UTF-16 encoding in the wide strings. | ||||
| TEST(WideStringToUtf8Test, CanEncodeValidUtf16SUrrogatePairs) { | ||||
|   EXPECT_STREQ("\xF0\x90\x90\x80", | ||||
|                WideStringToUtf8(L"\xD801\xDC00", -1).c_str()); | ||||
| } | ||||
|  | ||||
| // Tests that encoding an invalid UTF-16 surrogate pair | ||||
| // generates the expected result. | ||||
| TEST(WideStringToUtf8Test, CanEncodeInvalidUtf16SurrogatePair) { | ||||
|   // Leading surrogate is at the end of the string. | ||||
|   EXPECT_STREQ("\xED\xA0\x80", WideStringToUtf8(L"\xD800", -1).c_str()); | ||||
|   // Leading surrogate is not followed by the trailing surrogate. | ||||
|   EXPECT_STREQ("\xED\xA0\x80$", WideStringToUtf8(L"\xD800$", -1).c_str()); | ||||
|   // Trailing surrogate appears without a leading surrogate. | ||||
|   EXPECT_STREQ("\xED\xB0\x80PQR", WideStringToUtf8(L"\xDC00PQR", -1).c_str()); | ||||
| } | ||||
| #endif  // GTEST_WIDE_STRING_USES_UTF16_ | ||||
|  | ||||
| // Tests that codepoint concatenation works correctly. | ||||
| #ifndef GTEST_WIDE_STRING_USES_UTF16_ | ||||
| TEST(WideStringToUtf8Test, ConcatenatesCodepointsCorrectly) { | ||||
|   EXPECT_STREQ( | ||||
|       "\xF4\x88\x98\xB4" | ||||
|           "\xEC\x9D\x8D" | ||||
|           "\n" | ||||
|           "\xD5\xB6" | ||||
|           "\xE0\xA3\x93" | ||||
|           "\xF4\x88\x98\xB4", | ||||
|       WideStringToUtf8(L"\x108634\xC74D\n\x576\x8D3\x108634", -1).c_str()); | ||||
| } | ||||
| #else | ||||
| TEST(WideStringToUtf8Test, ConcatenatesCodepointsCorrectly) { | ||||
|   EXPECT_STREQ( | ||||
|       "\xEC\x9D\x8D" "\n" "\xD5\xB6" "\xE0\xA3\x93", | ||||
|       WideStringToUtf8(L"\xC74D\n\x576\x8D3", -1).c_str()); | ||||
| } | ||||
| #endif  // GTEST_WIDE_STRING_USES_UTF16_ | ||||
|  | ||||
| // Tests the List template class. | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 vladlosev
					vladlosev