2017-08-24 22:27:27 +02:00
|
|
|
/** @file
|
|
|
|
* @author Edouard DUPIN
|
|
|
|
* @copyright 2011, Edouard DUPIN, all right reserved
|
|
|
|
* @license MPL v2.0 (see license file)
|
|
|
|
*/
|
|
|
|
|
2017-08-28 00:02:11 +02:00
|
|
|
#include <etk/utf8.hpp>
|
|
|
|
//#include <etk/debug.hpp>
|
2017-08-24 22:27:27 +02:00
|
|
|
#include <etk/String.hpp>
|
2017-08-28 00:02:11 +02:00
|
|
|
#include <etk/UString.hpp>
|
2017-08-24 22:27:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Definitions of the named code points exposed by u32char.
// Every value is an ASCII control or spacing character.
const char32_t u32char::Null = U'\0';
const char32_t u32char::Return = U'\n';
const char32_t u32char::CarrierReturn = U'\r';
const char32_t u32char::Tabulation = U'\t';
// 0x7F is the ASCII DEL code.
const char32_t u32char::Suppress = 0x7F;
// 0x08 is the ASCII backspace code.
const char32_t u32char::Delete = 0x08;
const char32_t u32char::Space = U' ';
// 0x1B is the ASCII ESC code.
const char32_t u32char::Escape = 0x1B;
|
|
|
|
|
|
|
|
/**
 * @brief Check whether a code point is one of the four blank characters
 *        recognised here: space, tabulation, line feed or carriage return.
 * @param[in] _val Code point to test.
 * @return true when _val is ' ', '\t', '\n' or '\r', false otherwise.
 */
bool u32char::isWhiteChar(char32_t _val) {
	switch (_val) {
		case ' ':
		case '\t':
		case '\n':
		case '\r':
			return true;
		default:
			return false;
	}
}
|
|
|
|
|
|
|
|
/**
 * @brief Check whether a code point lies in one of the ranges that surround
 *        the ASCII digits and letters.
 * @param[in] _val Code point to test.
 * @return true when _val is below '0', strictly between '9' and 'A',
 *         strictly between 'Z' and 'a', or strictly between 'z' and 0xFF;
 *         false for digits, ASCII letters and every value >= 0xFF.
 */
bool u32char::isSpecialChar(char32_t _val) {
	const bool belowDigits = _val < '0';
	const bool digitsToUpper = (_val > '9') && (_val < 'A');
	const bool upperToLower = (_val > 'Z') && (_val < 'a');
	const bool aboveLower = (_val > 'z') && (_val < 0xFF);
	return belowDigits || digitsToUpper || upperToLower || aboveLower;
}
|
|
|
|
|
|
|
|
/**
 * @brief Check whether a code point is an ASCII decimal digit.
 * @param[in] _val Code point to test.
 * @return true when _val is in the range '0'..'9' (inclusive).
 */
bool u32char::isInteger(char32_t _val) {
	const uint32_t firstDigit = (uint32_t)'0';
	const uint32_t lastDigit = (uint32_t)'9';
	return (firstDigit <= _val) && (_val <= lastDigit);
}
|
|
|
|
|
|
|
|
/**
 * @brief Convert a digit code point to its numeric value by subtracting '0'.
 * @param[in] _val Code point to convert; no range check is performed here.
 * @return _val - '0' (0..9 when _val is an ASCII digit).
 */
int32_t u32char::toInt(char32_t _val) {
	const uint32_t zero = (uint32_t)'0';
	const uint32_t offset = _val - zero;
	return offset;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * @brief Remap a code point so that ASCII upper and lower case letters are
 *        interleaved ('A' -> 'A', 'a' -> 'A'+1, 'B' -> 'A'+2, ...).
 *        The punctuation ranges ':'..'@' and '['..'`' are shifted by a fixed
 *        offset (+52 and +26 respectively); every other value is unchanged.
 * @param[in] _val Code point to remap.
 * @return The remapped code point.
 * @note Presumably used as a comparison key for case-interleaved sorting;
 *       confirm against the callers.
 */
char32_t u32char::changeOrder(char32_t _val) {
	const bool upperCase = ('A' <= _val) && (_val <= 'Z');
	if (upperCase) {
		// Upper case letters take the even slots, starting at 'A'.
		const char32_t rank = _val - (uint32_t)'A';
		return rank * 2 + 'A';
	}
	const bool lowerCase = ('a' <= _val) && (_val <= 'z');
	if (lowerCase) {
		// Lower case letters take the odd slot just after their upper case twin.
		const char32_t rank = _val - (uint32_t)'a';
		return rank * 2 + 'A' + 1;
	}
	const bool punctuationAfterDigits = (':' <= _val) && (_val <= '@');
	if (punctuationAfterDigits) {
		return _val + 52;
	}
	const bool punctuationAfterUpper = ('[' <= _val) && (_val <= '`');
	if (punctuationAfterUpper) {
		return _val + 26;
	}
	return _val;
}
|
|
|
|
|
2018-08-23 21:01:35 +02:00
|
|
|
/**
 * @brief Encode one 32-bit code point as a null-terminated UTF-8 style byte sequence.
 *
 * Sequence length is chosen from the value:
 *   - <= 0x7F       : 1 byte  (0xxxxxxx)
 *   - <= 0x7FF      : 2 bytes (110xxxxx 10xxxxxx)
 *   - <= 0xFFFF     : 3 bytes (1110xxxx 10xxxxxx 10xxxxxx)
 *   - <= 0x1FFFFF   : 4 bytes (11110xxx + 3 continuation bytes)
 *   - <= 0x3FFFFFF  : 5 bytes (111110xx + 4 continuation bytes)
 *   - otherwise     : 6 bytes (111111xx + 5 continuation bytes)
 * The 5 and 6 byte forms are the author's own extension so that all 32 bits can be stored.
 *
 * @param[in] _val Code point to encode.
 * @param[out] _output Destination buffer; up to 6 data bytes plus the terminating 0 are written.
 * @return Number of data bytes written (the terminating 0 is not counted).
 */
int8_t u32char::convertUtf8(char32_t _val, char _output[7]) {
	if (_val <= 127) {
		// Plain ASCII: the value is stored as is.
		_output[0] = (char)_val;
		_output[1] = 0;
		return 1;
	}
	int8_t count = 0;
	uint32_t leadTag = 0;
	uint32_t leadMask = 0;
	if (_val <= 0x000007FF) {
		count = 2;
		leadTag = 0xC0;
		leadMask = 0x1F;
	} else if (_val <= 0x0000FFFF) {
		count = 3;
		leadTag = 0xE0;
		leadMask = 0x0F;
	} else if (_val <= 0x001FFFFF) {
		count = 4;
		leadTag = 0xF0;
		leadMask = 0x07;
	} else if (_val <= 0x03FFFFFF) {
		// From here on this is the author's personal interpretation.
		count = 5;
		leadTag = 0xF8;
		leadMask = 0x03;
	} else {
		count = 6;
		leadTag = 0xFC;
		leadMask = 0x03;
	}
	const int trailing = count - 1;
	// The lead byte carries the most significant payload bits.
	_output[0] = static_cast<char>(leadTag | ((_val >> (6 * trailing)) & leadMask));
	// Each continuation byte carries the next 6 bits, most significant group first.
	for (int index = 1; index <= trailing; ++index) {
		const int shift = 6 * (trailing - index);
		_output[index] = static_cast<char>(0x80 | ((_val >> shift) & 0x3F));
	}
	_output[count] = 0;
	return count;
}
|
2017-08-28 00:02:11 +02:00
|
|
|
|
|
|
|
/**
 * @brief Convert a UTF-32 string to an etk::String by delegating to etk::toString.
 * @param[in] _input String to convert.
 * @return The value produced by etk::toString(_input).
 */
etk::String u32char::convertToUtf8(const etk::UString& _input) {
	etk::String converted = etk::toString(_input);
	return converted;
}
|
|
|
|
|
2018-10-09 23:06:37 +02:00
|
|
|
/**
 * @brief Convert a single code point to an etk::String: the value is first
 *        wrapped in an etk::UString, then handed to etk::toString.
 * @param[in] _input Code point to convert.
 * @return The value produced by etk::toString for the one-element string.
 */
etk::String u32char::convertToUtf8(char32_t _input) {
	etk::String converted = etk::toString(etk::UString(_input));
	return converted;
}
|
|
|
|
|
2017-08-28 00:02:11 +02:00
|
|
|
/**
 * @brief Count the elements of a zero-terminated char32_t string.
 * @param[in] _input Pointer to the first element; a null pointer is treated as an empty string.
 * @return Number of elements before the terminating 0.
 */
size_t u32char::strlen(const char32_t* _input) {
	if (_input == nullptr) {
		return 0;
	}
	// The counter has the same type as the return value so that it cannot wrap earlier.
	size_t count = 0;
	while (_input[count] != 0) {
		++count;
	}
	return count;
}
|
|
|
|
/**
 * @brief Pair of code points holding the lower case and upper case form of one letter.
 */
struct DoubleChar {
	char32_t lower; //!< Lower case form.
	char32_t upper; //!< Upper case form.
};
|
|
|
|
DoubleChar conversionTable[] = {
|
|
|
|
{U'ç', U'Ç'},
|
|
|
|
|
|
|
|
{U'á', U'Á'}, {U'à', U'À'}, {U'ä', U'Ä'}, {U'â', U'Â'}, {U'å', U'Å'}, {U'ã', U'Ã'},
|
|
|
|
{U'é', U'É'}, {U'è', U'È'}, {U'ë', U'Ë'}, {U'ê', U'Ê'},
|
|
|
|
{U'ú', U'Ú'}, {U'ù', U'Ù'}, {U'ü', U'Ü'}, {U'û', U'Û'},
|
|
|
|
{U'í', U'Í'}, {U'ì', U'Ì'}, {U'ï', U'Ï'}, {U'î', U'Î'},
|
|
|
|
{U'ó', U'Ó'}, {U'ò', U'Ò'}, {U'ö', U'Ö'}, {U'ô', U'Ô'}, {U'õ', U'Õ'},
|
|
|
|
{U'ý', U'Ý'}, {U'ỳ', U'Ỳ'}, {U'ÿ', U'Ÿ'}, {U'ŷ', U'Ŷ'},
|
|
|
|
|
|
|
|
{U'ñ', U'Ñ'}, {U'ǹ', U'Ǹ'},
|
|
|
|
|
|
|
|
{U'ḧ', U'Ḧ'}, {U'ĥ', U'Ĥ'},
|
|
|
|
|
|
|
|
{U'ẅ', U'Ẅ'}, {U'ŵ', U'Ŵ'}, {U'ẁ', U'Ẁ'},
|
|
|
|
|
|
|
|
{U'ẍ', U'Ẍ'},
|
|
|
|
|
|
|
|
{U'æ', U'Æ'},
|
|
|
|
{U'ð', U'Ð'},
|
|
|
|
{U'ø', U'Ø'}
|
|
|
|
};
|
|
|
|
size_t conversionTableSize = sizeof(conversionTable)/sizeof(DoubleChar);
|
|
|
|
|
|
|
|
char32_t u32char::toUpper(char32_t _input) {
|
|
|
|
if (_input >= 'a' && _input <= 'z') {
|
|
|
|
return _input + ((int)'A'-(int)'a');
|
|
|
|
}
|
|
|
|
for (size_t iii = 0; iii < conversionTableSize; ++iii) {
|
|
|
|
if (conversionTable[iii].lower == _input) {
|
|
|
|
return conversionTable[iii].upper;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return _input;
|
|
|
|
}
|
|
|
|
|
|
|
|
char32_t u32char::toLower(char32_t _input) {
|
|
|
|
if (_input >= 'A' && _input <= 'Z') {
|
|
|
|
return _input + ((int)'a'-(int)'A');
|
|
|
|
}
|
|
|
|
for (size_t iii = 0; iii < conversionTableSize; ++iii) {
|
|
|
|
if (conversionTable[iii].upper == _input) {
|
|
|
|
return conversionTable[iii].lower;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return _input;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-08-24 22:27:27 +02:00
|
|
|
|
|
|
|
/**
 * @brief Compute the byte length of the UTF-8 style sequence starting at _data.
 *
 * A length k (1..6) is reported only when at least k bytes are available,
 * the lead byte carries the k-byte prefix and the k-1 following bytes are
 * continuation bytes (10xxxxxx). The 5 and 6 byte forms use the lead
 * prefixes 111110xx and 111111xx.
 *
 * @param[in] _data Pointer to the first byte of the sequence.
 * @param[in] _lenMax Number of readable bytes starting at _data.
 * @return 0 when _lenMax is negative, the sequence length when a pattern
 *         matches, and 1 when nothing matches (malformed or truncated data).
 */
static uint8_t sizeElement(const char* _data, int32_t _lenMax) {
	if (_lenMax < 0) {
		return 0;
	}
	// Lead byte mask / expected value for each sequence length (index = length - 1).
	struct LeadPattern {
		int mask;
		int expected;
	};
	static const LeadPattern leadPatterns[6] = {
		{0x80, 0x00},
		{0xE0, 0xC0},
		{0xF0, 0xE0},
		{0xF8, 0xF0},
		{0xFC, 0xF8},
		{0xFC, 0xFC}
	};
	for (int32_t length = 1; length <= 6; ++length) {
		if (_lenMax < length) {
			// Not enough bytes for this length, so none of the longer ones can match either.
			break;
		}
		const LeadPattern& pattern = leadPatterns[length - 1];
		if ((_data[0] & pattern.mask) != pattern.expected) {
			continue;
		}
		// Check the continuation bytes in order, stopping at the first mismatch.
		bool continuationOk = true;
		for (int32_t index = 1; index < length; ++index) {
			if ((_data[index] & 0xC0) != 0x80) {
				continuationOk = false;
				break;
			}
		}
		if (continuationOk) {
			return static_cast<uint8_t>(length);
		}
	}
	// Nothing matched: report a single byte so that the caller still moves forward.
	return 1;
}
|
|
|
|
|
|
|
|
char32_t utf8::convertChar32(const char* _input) {
|
|
|
|
char32_t value = 0;
|
2018-06-19 22:15:52 +02:00
|
|
|
if (null == _input) {
|
2017-08-24 22:27:27 +02:00
|
|
|
return value;
|
|
|
|
}
|
|
|
|
int32_t len = strlen(_input);
|
|
|
|
len = sizeElement(_input, len);
|
|
|
|
switch (len) {
|
|
|
|
case 1:
|
2018-08-23 21:01:35 +02:00
|
|
|
// input ==> 00000000 00000000 00000000 00000000 00000000 0xxxxxxx
|
|
|
|
// output ==> -------- -------- -------- -1111111
|
2017-08-24 22:27:27 +02:00
|
|
|
value = (uint8_t)(_input[0]) & 0x7F;
|
|
|
|
return value;
|
|
|
|
case 2:
|
2018-08-23 21:01:35 +02:00
|
|
|
// input ==> 00000000 00000000 00000000 00000000 110xxxxx 10xxxxxx
|
|
|
|
// output ==> -------- -------- -----222 22111111
|
2017-08-24 22:27:27 +02:00
|
|
|
value = (((uint8_t)_input[0]) & 0x1F)<< 6;
|
2018-08-23 21:01:35 +02:00
|
|
|
value += (((uint8_t)_input[1]) & 0x3F);
|
2017-08-24 22:27:27 +02:00
|
|
|
return value;
|
|
|
|
case 3:
|
2018-08-23 21:01:35 +02:00
|
|
|
// input ==> 00000000 00000000 00000000 1110xxxx 10xxxxxx 10xxxxxx
|
|
|
|
// output ==> -------- -------- 33332222 22111111
|
2017-08-24 22:27:27 +02:00
|
|
|
value = (((uint8_t)_input[0]) & 0x0F)<< 12;
|
|
|
|
value += (((uint8_t)_input[1]) & 0x3F)<< 6;
|
2018-08-23 21:01:35 +02:00
|
|
|
value += (((uint8_t)_input[2]) & 0x3F);
|
2017-08-24 22:27:27 +02:00
|
|
|
return value;
|
|
|
|
case 4:
|
2018-08-23 21:01:35 +02:00
|
|
|
// input ==> 00000000 00000000 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
|
|
// output ==> -------- ---44433 33332222 22111111
|
2017-08-24 22:27:27 +02:00
|
|
|
value = (((uint8_t)_input[0]) & 0x07)<< 18;
|
|
|
|
value += (((uint8_t)_input[1]) & 0x3F)<< 12;
|
|
|
|
value += (((uint8_t)_input[2]) & 0x3F)<< 6;
|
2018-08-23 21:01:35 +02:00
|
|
|
value += (((uint8_t)_input[3]) & 0x3F);
|
|
|
|
return value;
|
|
|
|
case 5:
|
|
|
|
// input ==> 00000000 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
|
|
// output ==> ------55 44444433 33332222 22111111
|
|
|
|
value = (((uint8_t)_input[0]) & 0x03)<< 24;
|
|
|
|
value += (((uint8_t)_input[1]) & 0x3F)<< 18;
|
|
|
|
value += (((uint8_t)_input[2]) & 0x3F)<< 12;
|
|
|
|
value += (((uint8_t)_input[3]) & 0x3F)<< 6;
|
|
|
|
value += (((uint8_t)_input[4]) & 0x3F);
|
|
|
|
return value;
|
|
|
|
case 6:
|
|
|
|
// input ==> 111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
|
|
// output ==> 66555555 44444433 33332222 22111111
|
|
|
|
value = (((uint8_t)_input[0]) & 0x03)<< 30;
|
|
|
|
value += (((uint8_t)_input[1]) & 0x3F)<< 24;
|
|
|
|
value += (((uint8_t)_input[2]) & 0x3F)<< 18;
|
|
|
|
value += (((uint8_t)_input[3]) & 0x3F)<< 12;
|
|
|
|
value += (((uint8_t)_input[4]) & 0x3F)<< 6;
|
|
|
|
value += (((uint8_t)_input[5]) & 0x3F);
|
2017-08-24 22:27:27 +02:00
|
|
|
return value;
|
|
|
|
}
|
2018-08-23 21:01:35 +02:00
|
|
|
// An error occurred...
|
|
|
|
value = _input[0];
|
|
|
|
return value;
|
2017-08-24 22:27:27 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * @brief Get the length of a UTF-8 style sequence from its lead byte only.
 * @param[in] _input First byte of the sequence.
 * @return 1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx, 4 for 11110xxx,
 *         5 for 111110xx, 6 for 111111xx, and 1 for any other byte
 *         (a continuation byte 10xxxxxx).
 */
int8_t utf8::length(const char _input) {
	if ((_input & 0x80) == 0x00) {
		return 1;
	}
	if ((_input & 0xE0) == 0xC0) {
		return 2;
	}
	if ((_input & 0xF0) == 0xE0) {
		return 3;
	}
	if ((_input & 0xF8) == 0xF0) {
		return 4;
	}
	if ((_input & 0xFC) == 0xF8) {
		return 5;
	}
	if ((_input & 0xFC) == 0xFC) {
		// 111111xx starts a 6 byte sequence, matching the encoder and sizeElement().
		return 6;
	}
	// Not a lead byte: count it as a single element.
	return 1;
}
|
|
|
|
|
|
|
|
/**
 * @brief Check whether a byte can start a sequence.
 * @param[in] _input Byte to test.
 * @return true for a single-byte element (top bit 0) or a multi-byte lead
 *         byte (top two bits 11); false for a continuation byte (10xxxxxx).
 */
bool utf8::first(const char _input) {
	// Top bit cleared: the byte is a complete element on its own.
	const bool singleByte = (_input & 0x80) == 0x00;
	// Top bit set: only a lead byte also has the next bit set.
	const bool leadByte = (_input & 0x40) == 0x40;
	return singleByte || leadByte;
}
|
2017-08-28 00:02:11 +02:00
|
|
|
|
|
|
|
/**
 * @brief Decode a null-terminated UTF-8 style byte string into an etk::UString.
 * @param[in] _input Null-terminated byte string; may be null.
 * @return The decoded string; empty when _input is null.
 */
etk::UString utf8::convertUnicode(const char* _input) {
	if (_input == null) {
		return U"";
	}
	etk::UString out;
	int64_t pos = 0;
	const int64_t inputLen = strlen(_input);
	while (pos < inputLen) {
		const int64_t remaining = inputLen - pos;
		// sizeElement() never needs more than 6 bytes. Clamping keeps the value
		// inside int32_t, so it can never turn negative (which would make
		// sizeElement() return 0 and stall this loop).
		const int32_t lenMax = (remaining > 6) ? 6 : static_cast<int32_t>(remaining);
		const uint8_t elementSize = sizeElement(&_input[pos], lenMax);
		out += utf8::convertChar32(&_input[pos]);
		pos += elementSize;
	}
	return out;
}
|
|
|
|
|
|
|
|
/**
 * @brief Decode an etk::String into an etk::UString by forwarding its
 *        C string to the const char* overload.
 * @param[in] _input String to decode.
 * @return The decoded string.
 */
etk::UString utf8::convertUnicode(const etk::String& _input) {
	const char* rawData = _input.c_str();
	return utf8::convertUnicode(rawData);
}
|