From 056734e5fe81ce54e9303528d5a5b9d5f920382b Mon Sep 17 00:00:00 2001
From: Edouard DUPIN
Date: Thu, 23 Aug 2018 21:01:35 +0200
Subject: [PATCH] [DEV] test utf8

---
Review notes (this section is ignored by `git am`):

 * What the patch does: u32char::convertUtf8() now encodes directly into the
   caller's buffer (grown from 5 to 7 chars) and gains 5- and 6-byte forms;
   sizeElement(), utf8::convertChar32() and utf8::length() learn the same
   forms; utf8::convertUnicode() reuses sizeElement(); a round-trip test is
   added. The 5/6-byte forms are the author's own extension (see the
   "personal interpretation" comment), not standard UTF-8.
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch; use `git apply --ignore-whitespace` if context lines
   do not match the tree.
 * Fixed relative to the original revision (hunk line counts are unchanged):
     - utf8::length(): a 111111xx lead byte now reports 6 (was 5), matching
       sizeElement() and convertUtf8().
     - utf8::convertChar32() case 6: the lead bits are widened to char32_t
       before `<< 30`, so the shift no longer goes through a signed int.
     - utf8::convertUnicode(): always advance by at least one byte;
       sizeElement() appears to return 0 on malformed input (see the removed
       "case 0" comment), which would otherwise loop forever.
     - test/testUTF8.cpp: the outer counter now selects the high byte of the
       tested value and both bounds are inclusive, so the 6-byte form and the
       last value of each range are actually exercised.
 * The `index` blob ids below still refer to the original revision.
 * NOTE(review): the three bare `#include` lines in test/testUTF8.cpp and the
   author e-mail lost their <...> part in extraction; restore them before
   applying.

 etk/utf8.cpp      | 203 ++++++++++++++++++++++++----------------------
 etk/utf8.hpp      |   2 +-
 lutin_etk-test.py |   1 +
 test/testUTF8.cpp |  25 ++++++
 4 files changed, 132 insertions(+), 99 deletions(-)
 create mode 100644 test/testUTF8.cpp

diff --git a/etk/utf8.cpp b/etk/utf8.cpp
index 80244ef..4b62dc9 100644
--- a/etk/utf8.cpp
+++ b/etk/utf8.cpp
@@ -69,64 +69,59 @@ char32_t u32char::changeOrder(char32_t _val) {
 	return _val;
 }
 
-static uint32_t getUtf8Val(char32_t _val) {
-	uint32_t output = 0;
+int8_t u32char::convertUtf8(char32_t _val, char _output[7]) {
 	if (_val <= 127) {
-		output = _val;
-	} else if (_val <= 2047) {
-		// output ==> 00000000 00000000 110xxxxx 10xxxxxx
-		// input ==> -------- -------- -----222 22111111
-		output = 0x0000C080;
-		output+= (_val & 0x000007C0)<<2;
-		output+= _val & 0x0000003F;
-	} else if (_val <= 65535) {
-		// output ==> 00000000 1110xxxx 10xxxxxx 10xxxxxx
-		// input ==> -------- -------- 33332222 22111111
-		output = 0x00E08080;
-		output+= (_val & 0x0000F000)<<4;
-		output+= (_val & 0x00000FC0)<<2;
-		output+= _val & 0x0000003F;
-	} else if (_val <= 1114111) {
-		// output ==> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-		// input ==> -------- ---44433 33332222 22111111
-		output = 0xF0808080;
-		output+= (_val & 0x001C0000)<<6;
-		output+= (_val & 0x0003F000)<<4;
-		output+= (_val & 0x00000FC0)<<2;
-		output+= _val & 0x0000003F;
-	} else {
-		//TK_ERROR("NOT UTF8 character input...");
-		printf("not an utf8 char : %#08x\n", _val);
-		return 0;
-	}
-	//printf("utf-8 conversion : %d=%08x ==> %08x\n",value, value, output);
-	return output;
-}
-
-int8_t u32char::convertUtf8(char32_t _val, char _output[5]) {
-	uint32_t value = getUtf8Val(_val);
-	if (0xFF >= value) {
-		_output[0] = (char)value;
-		_output[1] = '\0';
+		// input ==> -------- -------- -------- -1111111
+		// output ==> 00000000 00000000 00000000 00000000 00000000 0xxxxxxx
+		_output[0] = (char)_val;
+		_output[1] = 0;
 		return 1;
-	} else if (0xFFFF >= value) {
-		_output[0] = (char)((value>>8) & 0x000000FF);
-		_output[1] = (char)value;
-		_output[2] = '\0';
+	} else if (_val <= 0x000007FF) {
+		// input ==> -------- -------- -----222 22111111
+		// output ==> 00000000 00000000 00000000 00000000 110xxxxx 10xxxxxx
+		_output[0] = 0xC0 | ( (_val & 0x000007C0) >> 6 );
+		_output[1] = 0x80 | ( (_val & 0x0000003F) );
+		_output[2] = 0;
 		return 2;
-	} else if (0xFFFFFF >= value) {
-		_output[0] = (char)((value>>16) & 0x000000FF);
-		_output[1] = (char)((value>>8) & 0x000000FF);
-		_output[2] = (char)value;
-		_output[3] = '\0';
+	} else if (_val <= 0x0000FFFF) {
+		// input ==> -------- -------- 33332222 22111111
+		// output ==> 00000000 00000000 00000000 1110xxxx 10xxxxxx 10xxxxxx
+		_output[0] = 0xE0 | ( (_val & 0x0000F000) >> 12 );
+		_output[1] = 0x80 | ( (_val & 0x00000FC0) >> 6 );
+		_output[2] = 0x80 | ( (_val & 0x0000003F) );
+		_output[3] = 0;
 		return 3;
-	} else {
-		_output[0] = (char)((value>>24) & 0x000000FF);
-		_output[1] = (char)((value>>16) & 0x000000FF);
-		_output[2] = (char)((value>>8) & 0x000000FF);
-		_output[3] = (char)value;
-		_output[4] = '\0';
+	} else if (_val <= 0x001FFFFF) {
+		// input ==> -------- ---44433 33332222 22111111
+		// output ==> 00000000 00000000 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+		_output[0] = 0xF0 | ( (_val & 0x001C0000) >> 18 );
+		_output[1] = 0x80 | ( (_val & 0x0003F000) >> 12 );
+		_output[2] = 0x80 | ( (_val & 0x00000FC0) >> 6 );
+		_output[3] = 0x80 | ( (_val & 0x0000003F) );
+		_output[4] = 0;
 		return 4;
+		// the next element is my personal interpretation...
+	} else if (_val <= 0x03FFFFFF) {
+		// input ==> ------55 44444433 33332222 22111111
+		// output ==> 00000000 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+		_output[0] = 0xF8 | ( (_val & 0x03000000) >> 24 );
+		_output[1] = 0x80 | ( (_val & 0x00FC0000) >> 18 );
+		_output[2] = 0x80 | ( (_val & 0x0003F000) >> 12 );
+		_output[3] = 0x80 | ( (_val & 0x00000FC0) >> 6 );
+		_output[4] = 0x80 | ( (_val & 0x0000003F) );
+		_output[5] = 0;
+		return 5;
+	} else {
+		// input ==> 66555555 44444433 33332222 22111111
+		// output ==> 111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+		_output[0] = 0xFC | ( (_val & 0xC0000000) >> 30 );
+		_output[1] = 0x80 | ( (_val & 0x3F000000) >> 24 );
+		_output[2] = 0x80 | ( (_val & 0x00FC0000) >> 18 );
+		_output[3] = 0x80 | ( (_val & 0x0003F000) >> 12 );
+		_output[4] = 0x80 | ( (_val & 0x00000FC0) >> 6 );
+		_output[5] = 0x80 | ( (_val & 0x0000003F) );
+		_output[6] = 0;
+		return 6;
 	}
 }
 
@@ -223,6 +218,21 @@ static uint8_t sizeElement(const char* _data, int32_t _lenMax) {
 	           && (_data[2] & 0xC0) == 0x80
 	           && (_data[3] & 0xC0) == 0x80) {
 		size = 4;
+	} else if( _lenMax >= 5
+	           && (_data[0] & 0xFC) == 0xF8
+	           && (_data[1] & 0xC0) == 0x80
+	           && (_data[2] & 0xC0) == 0x80
+	           && (_data[3] & 0xC0) == 0x80
+	           && (_data[4] & 0xC0) == 0x80) {
+		size = 5;
+	} else if( _lenMax >= 6
+	           && (_data[0] & 0xFC) == 0xFC
+	           && (_data[1] & 0xC0) == 0x80
+	           && (_data[2] & 0xC0) == 0x80
+	           && (_data[3] & 0xC0) == 0x80
+	           && (_data[4] & 0xC0) == 0x80
+	           && (_data[5] & 0xC0) == 0x80) {
+		size = 6;
 	}
 	return size;
 }
@@ -235,29 +245,55 @@ char32_t utf8::convertChar32(const char* _input) {
 	int32_t len = strlen(_input);
 	len = sizeElement(_input, len);
 	switch (len) {
-		default:
-			// case 0 : An error occurred...
-			value = _input[0];
-			return value;
 		case 1:
+			// input ==> 00000000 00000000 00000000 00000000 00000000 0xxxxxxx
+			// output ==> -------- -------- -------- -1111111
 			value = (uint8_t)(_input[0]) & 0x7F;
 			return value;
 		case 2:
+			// input ==> 00000000 00000000 00000000 00000000 110xxxxx 10xxxxxx
+			// output ==> -------- -------- -----222 22111111
 			value = (((uint8_t)_input[0]) & 0x1F)<< 6;
-			value += ((uint8_t)_input[1]) & 0x3F;
+			value += (((uint8_t)_input[1]) & 0x3F);
 			return value;
 		case 3:
+			// input ==> 00000000 00000000 00000000 1110xxxx 10xxxxxx 10xxxxxx
+			// output ==> -------- -------- 33332222 22111111
 			value = (((uint8_t)_input[0]) & 0x0F)<< 12;
 			value += (((uint8_t)_input[1]) & 0x3F)<< 6;
-			value += ((uint8_t)_input[2]) & 0x3F;
+			value += (((uint8_t)_input[2]) & 0x3F);
 			return value;
 		case 4:
+			// input ==> 00000000 00000000 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+			// output ==> -------- ---44433 33332222 22111111
 			value = (((uint8_t)_input[0]) & 0x07)<< 18;
 			value += (((uint8_t)_input[1]) & 0x3F)<< 12;
 			value += (((uint8_t)_input[2]) & 0x3F)<< 6;
-			value += ((uint8_t)_input[3]) & 0x3F;
+			value += (((uint8_t)_input[3]) & 0x3F);
+			return value;
+		case 5:
+			// input ==> 00000000 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+			// output ==> ------55 44444433 33332222 22111111
+			value = (((uint8_t)_input[0]) & 0x03)<< 24;
+			value += (((uint8_t)_input[1]) & 0x3F)<< 18;
+			value += (((uint8_t)_input[2]) & 0x3F)<< 12;
+			value += (((uint8_t)_input[3]) & 0x3F)<< 6;
+			value += (((uint8_t)_input[4]) & 0x3F);
+			return value;
+		case 6:
+			// input ==> 111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+			// output ==> 66555555 44444433 33332222 22111111
+			value = ((char32_t)(((uint8_t)_input[0]) & 0x03))<< 30;
+			value += (((uint8_t)_input[1]) & 0x3F)<< 24;
+			value += (((uint8_t)_input[2]) & 0x3F)<< 18;
+			value += (((uint8_t)_input[3]) & 0x3F)<< 12;
+			value += (((uint8_t)_input[4]) & 0x3F)<< 6;
+			value += (((uint8_t)_input[5]) & 0x3F);
 			return value;
 	}
+	// An error occurred...
+	value = _input[0];
+	return value;
 }
 
 int8_t utf8::length(const char _input) {
@@ -273,6 +309,12 @@ int8_t utf8::length(const char _input) {
 	if((_input&0xF8) == 0xF0) {
 		return 4;
 	}
+	if((_input&0xFC) == 0xF8) {
+		return 5;
+	}
+	if((_input&0xFC) == 0xFC) {
+		return 6;
+	}
 	return 1;
 }
 
@@ -298,44 +340,9 @@ etk::UString utf8::convertUnicode(const char* _input) {
 	int64_t inputLen = strlen(_input);
 	while (pos < inputLen) {
 		int32_t lenMax = inputLen - pos;
-		//4 case
-		if (    1<=lenMax
-		     && 0x00 == (_input[pos+0] & 0x80) ) {
-			tmpData[0] = _input[pos+0];
-			tmpData[1] = '\0';
-			pos += 1;
-		} else if (    2<=lenMax
-		            && 0xC0 == (_input[pos+0] & 0xE0)
-		            && 0x80 == (_input[pos+1] & 0xC0) ) {
-			tmpData[0] = _input[pos+0];
-			tmpData[1] = _input[pos+1];
-			tmpData[2] = '\0';
-			pos += 2;
-		} else if (    3<=lenMax
-		            && 0xE0 == (_input[pos+0] & 0xF0)
-		            && 0x80 == (_input[pos+1] & 0xC0)
-		            && 0x80 == (_input[pos+2] & 0xC0)) {
-			tmpData[0] = _input[pos+0];
-			tmpData[1] = _input[pos+1];
-			tmpData[2] = _input[pos+2];
-			tmpData[3] = '\0';
-			pos += 3;
-		} else if (    4<=lenMax
-		            && 0xF0 == (_input[pos+0] & 0xF8)
-		            && 0x80 == (_input[pos+1] & 0xC0)
-		            && 0x80 == (_input[pos+2] & 0xC0)
-		            && 0x80 == (_input[pos+3] & 0xC0)) {
-			tmpData[0] = _input[pos+0];
-			tmpData[1] = _input[pos+1];
-			tmpData[2] = _input[pos+2];
-			tmpData[3] = _input[pos+3];
-			tmpData[4] = '\0';
-			pos += 4;
-		} else {
-			tmpData[0] = '\0';
-			pos += 1;
-		}
-		out += utf8::convertChar32(tmpData);
+		uint8_t tmpPos = sizeElement(&_input[pos], lenMax);
+		out += utf8::convertChar32(&_input[pos]);
+		pos += (tmpPos > 0) ? tmpPos : 1; // always advance, even on a malformed byte
 	}
 	return out;
 }
diff --git a/etk/utf8.hpp b/etk/utf8.hpp
index df87716..61142c5 100644
--- a/etk/utf8.hpp
+++ b/etk/utf8.hpp
@@ -65,7 +65,7 @@ namespace u32char {
 	 * @param[out] _output Char data converted
 	 * @return Number of char in utf8
 	 */
-	int8_t convertUtf8(char32_t _val, char _output[5]);
+	int8_t convertUtf8(char32_t _val, char _output[7]);
 	etk::String convertToUtf8(const etk::UString& _input);
 	char32_t toUpper(char32_t _input);
 	char32_t toLower(char32_t _input);
diff --git a/lutin_etk-test.py b/lutin_etk-test.py
index 524c5d0..1e48b13 100644
--- a/lutin_etk-test.py
+++ b/lutin_etk-test.py
@@ -47,6 +47,7 @@ def configure(target, my_module):
 	    'test/testString.cpp',
 	    'test/testTrait.cpp',
 	    'test/testThrow.cpp',
+	    'test/testUTF8.cpp',
 	    ])
 	my_module.add_depend([
 	    'etk',
diff --git a/test/testUTF8.cpp b/test/testUTF8.cpp
new file mode 100644
index 0000000..2ed9457
--- /dev/null
+++ b/test/testUTF8.cpp
@@ -0,0 +1,25 @@
+/**
+ * @author Edouard DUPIN
+ * @copyright 2011, Edouard DUPIN, all right reserved
+ * @license MPL v2.0 (see license file)
+ */
+
+
+#include
+#include
+#include
+#include "ConstructDestruct.hpp"
+
+TEST(TestUTF8, full) {
+	// Round-trip every 32-bit value: encode to UTF-8, decode it back, compare
+	for (uint32_t jjj=0; jjj<=0XFF; ++jjj) {
+		TEST_PRINT("range : " << jjj << " / " << 0XFF );
+		for (uint32_t iii=0; iii<=0XFFFFFF; ++iii) {
+			char32_t inputValue = (jjj << 24) + iii;
+			char output[7];
+			u32char::convertUtf8(inputValue, output);
+			char32_t outputValue = utf8::convertChar32(output);
+			EXPECT_EQ(inputValue, outputValue);
+		}
+	}
+}
\ No newline at end of file