etk/etk-core/utf8.cpp

357 lines
10 KiB
C++
Raw Permalink Normal View History

2017-08-24 22:27:27 +02:00
/** @file
* @author Edouard DUPIN
* @copyright 2011, Edouard DUPIN, all right reserved
* @license MPL v2.0 (see license file)
*/
2017-08-28 00:02:11 +02:00
#include <etk/utf8.hpp>
//#include <etk/debug.hpp>
2017-08-24 22:27:27 +02:00
#include <etk/String.hpp>
2017-08-28 00:02:11 +02:00
#include <etk/UString.hpp>
2017-08-24 22:27:27 +02:00
// Definitions of the well-known character constants declared for u32char.
// Each value is the ASCII code of the corresponding control/printable character.
const char32_t u32char::Null = U'\0';
const char32_t u32char::Return = U'\n';
const char32_t u32char::CarrierReturn = U'\r';
const char32_t u32char::Tabulation = U'\t';
// 127 is the ASCII DEL code.
const char32_t u32char::Suppress = 0x7F;
// 8 is the ASCII backspace code.
const char32_t u32char::Delete = 0x08;
const char32_t u32char::Space = U' ';
// 27 is the ASCII ESC code.
const char32_t u32char::Escape = 0x1B;
/**
 * @brief Check whether a character is one of the four blank characters
 *        recognised here: space, tabulation, line feed or carriage return.
 * @param[in] _val Character to test.
 * @return true if _val is ' ', '\t', '\n' or '\r', false otherwise.
 */
bool u32char::isWhiteChar(char32_t _val) {
	switch (_val) {
		case ' ':
		case '\t':
		case '\n':
		case '\r':
			return true;
		default:
			return false;
	}
}
/**
 * @brief Check whether a character is neither an ASCII digit nor an ASCII letter.
 * @param[in] _val Character to test.
 * @return true when _val lies in one of the gaps around the ranges
 *         '0'..'9', 'A'..'Z' and 'a'..'z' and is strictly below 0xFF;
 *         false for digits, ASCII letters and every value >= 0xFF.
 */
bool u32char::isSpecialChar(char32_t _val) {
	const bool beforeDigits = _val < '0';
	const bool betweenDigitsAndUpper = _val > '9' && _val < 'A';
	const bool betweenUpperAndLower = _val > 'Z' && _val < 'a';
	const bool afterLower = _val > 'z' && _val < 0xFF;
	return beforeDigits
	    || betweenDigitsAndUpper
	    || betweenUpperAndLower
	    || afterLower;
}
/**
 * @brief Check whether a character is an ASCII decimal digit.
 * @param[in] _val Character to test.
 * @return true if _val is in the range '0'..'9' (inclusive).
 */
bool u32char::isInteger(char32_t _val) {
	const uint32_t firstDigit = (uint32_t)'0';
	const uint32_t lastDigit = (uint32_t)'9';
	return firstDigit <= _val && _val <= lastDigit;
}
/**
 * @brief Convert a digit character to its numeric value by subtracting '0'.
 * @param[in] _val Character to convert; no range check is performed.
 * @return _val - '0' (0..9 when _val is an ASCII digit).
 */
int32_t u32char::toInt(char32_t _val) {
	const uint32_t zero = (uint32_t)'0';
	return _val - zero;
}
/**
 * @brief Remap a character to an alternative ordering value.
 *
 * Upper and lower case ASCII letters are interleaved starting at 'A'
 * (A, a, B, b, ... Z, z), the range ':'..'@' is shifted by 52 and the
 * range '['..'`' is shifted by 26. Every other value is returned unchanged.
 *
 * NOTE(review): with these offsets ':'..'@' map to 110..116, which are also
 * produced by some letters (e.g. both ':' and 'w' give 110) - confirm this
 * is intended.
 *
 * @param[in] _val Character to remap.
 * @return The remapped value.
 */
char32_t u32char::changeOrder(char32_t _val) {
	const bool isUpper = (_val >= 'A' && _val <= 'Z');
	const bool isLower = (_val >= 'a' && _val <= 'z');
	if (isUpper) {
		// even slots: A->'A'+0, B->'A'+2, ...
		const uint32_t rank = _val - (uint32_t)'A';
		return rank*2 + 'A';
	}
	if (isLower) {
		// odd slots: a->'A'+1, b->'A'+3, ...
		const uint32_t rank = _val - (uint32_t)'a';
		return rank*2 + 'A' + 1;
	}
	if (_val >= ':' && _val <= '@') {
		return _val + 52;
	}
	if (_val >= '[' && _val <= '`') {
		return _val + 26;
	}
	return _val;
}
2018-08-23 21:01:35 +02:00
/**
 * @brief Encode one 32-bit character as a NUL-terminated UTF-8 style byte sequence.
 *
 * Values up to 0x1FFFFF use the 1 to 4 byte forms. Larger values use 5 and
 * 6 byte forms (lead bytes 111110xx and 111111xx) so that all 32 bits of
 * the input can be stored.
 *
 * @param[in] _val Character to encode.
 * @param[out] _output Destination buffer (at least 7 bytes): receives the
 *                     encoded bytes followed by a terminating 0.
 * @return Number of encoded bytes written, not counting the terminator (1..6).
 */
int8_t u32char::convertUtf8(char32_t _val, char _output[7]) {
	// Single byte: 0xxxxxxx
	if (_val <= 0x7F) {
		_output[0] = (char)_val;
		_output[1] = 0;
		return 1;
	}
	// Select the sequence length and the marker bits of the lead byte.
	int8_t nbByte = 6;
	uint32_t leadMarker = 0xFC; // 111111xx
	if (_val <= 0x000007FF) {
		nbByte = 2;
		leadMarker = 0xC0; // 110xxxxx
	} else if (_val <= 0x0000FFFF) {
		nbByte = 3;
		leadMarker = 0xE0; // 1110xxxx
	} else if (_val <= 0x001FFFFF) {
		nbByte = 4;
		leadMarker = 0xF0; // 11110xxx
	} else if (_val <= 0x03FFFFFF) {
		nbByte = 5;
		leadMarker = 0xF8; // 111110xx
	}
	// Fill the continuation bytes (10xxxxxx) from the last one to the
	// second one, consuming 6 payload bits each time.
	uint32_t payload = _val;
	for (int8_t index = nbByte - 1; index > 0; --index) {
		_output[index] = (char)(0x80 | (payload & 0x3F));
		payload >>= 6;
	}
	// What is left fits in the free bits of the lead byte.
	_output[0] = (char)(leadMarker | payload);
	_output[nbByte] = 0;
	return nbByte;
}
2017-08-28 00:02:11 +02:00
/**
 * @brief Convert a 32-bit character string to an etk::String.
 *        The conversion itself is delegated to etk::toString().
 * @param[in] _input String to convert.
 * @return The result of etk::toString(_input).
 */
etk::String u32char::convertToUtf8(const etk::UString& _input) {
return etk::toString(_input);
}
2018-10-09 23:06:37 +02:00
/**
 * @brief Convert a single 32-bit character to an etk::String.
 *        The character is wrapped in a temporary etk::UString which is then
 *        passed to etk::toString().
 * @param[in] _input Character to convert.
 * @return The result of etk::toString() on the temporary string.
 */
etk::String u32char::convertToUtf8(char32_t _input) {
return etk::toString(etk::UString(_input));
}
2017-08-28 00:02:11 +02:00
/**
 * @brief Count the number of characters of a 0-terminated char32_t string.
 * @param[in] _input Pointer to the first character; must point to a
 *                   0-terminated sequence.
 * @return Number of characters before the terminating 0.
 */
size_t u32char::strlen(const char32_t* _input) {
	// Count with the return type itself: a 32-bit counter would wrap on
	// very long strings while the function promises a size_t.
	size_t count = 0;
	while (_input[count] != 0) {
		++count;
	}
	return count;
}
/**
 * @brief Pair of characters holding the lower case and upper case form of
 *        the same letter.
 */
class DoubleChar {
	public:
		char32_t lower; //!< lower case form
		char32_t upper; //!< upper case form
};

/**
 * @brief Lower/upper case pairs for accented and special Latin letters.
 *
 * NOTE(review): six entries were empty character literals (U''), which is
 * ill-formed. They are reconstructed below as universal character names,
 * chosen from the pattern of the neighbouring entries - please confirm them
 * against the intended characters.
 */
DoubleChar conversionTable[] = {
	{U'ç', U'Ç'},
	{U'á', U'Á'}, {U'à', U'À'}, {U'ä', U'Ä'}, {U'â', U'Â'}, {U'å', U'Å'}, {U'ã', U'Ã'},
	{U'é', U'É'}, {U'è', U'È'}, {U'ë', U'Ë'}, {U'ê', U'Ê'},
	{U'ú', U'Ú'}, {U'ù', U'Ù'}, {U'ü', U'Ü'}, {U'û', U'Û'},
	{U'í', U'Í'}, {U'ì', U'Ì'}, {U'ï', U'Ï'}, {U'î', U'Î'},
	{U'ó', U'Ó'}, {U'ò', U'Ò'}, {U'ö', U'Ö'}, {U'ô', U'Ô'}, {U'õ', U'Õ'},
	// second entry reconstructed: y with grave
	{U'ý', U'Ý'}, {U'\u1EF3', U'\u1EF2'}, {U'ÿ', U'Ÿ'}, {U'ŷ', U'Ŷ'},
	{U'ñ', U'Ñ'}, {U'ǹ', U'Ǹ'},
	// first entry reconstructed: h with diaeresis
	{U'\u1E27', U'\u1E26'}, {U'ĥ', U'Ĥ'},
	// first and last entries reconstructed: w with diaeresis, w with grave
	{U'\u1E85', U'\u1E84'}, {U'ŵ', U'Ŵ'}, {U'\u1E81', U'\u1E80'},
	// reconstructed: x with diaeresis
	{U'\u1E8D', U'\u1E8C'},
	{U'æ', U'Æ'},
	{U'ð', U'Ð'},
	{U'ø', U'Ø'}
};

/// Number of entries in conversionTable.
size_t conversionTableSize = sizeof(conversionTable)/sizeof(DoubleChar);
char32_t u32char::toUpper(char32_t _input) {
if (_input >= 'a' && _input <= 'z') {
return _input + ((int)'A'-(int)'a');
}
for (size_t iii = 0; iii < conversionTableSize; ++iii) {
if (conversionTable[iii].lower == _input) {
return conversionTable[iii].upper;
}
}
return _input;
}
char32_t u32char::toLower(char32_t _input) {
if (_input >= 'A' && _input <= 'Z') {
return _input + ((int)'a'-(int)'A');
}
for (size_t iii = 0; iii < conversionTableSize; ++iii) {
if (conversionTable[iii].upper == _input) {
return conversionTable[iii].lower;
}
}
return _input;
}
2017-08-24 22:27:27 +02:00
/**
 * @brief Compute the number of bytes used by the encoded element starting at _data.
 *
 * The lead byte selects the expected length (1 to 6). That length is only
 * returned when enough bytes are available and every following byte has
 * the continuation form 10xxxxxx; in every other case the element is
 * treated as a single byte.
 *
 * @param[in] _data Pointer to the first byte of the element.
 * @param[in] _lenMax Number of readable bytes starting at _data.
 * @return 0 if _lenMax is negative, otherwise the element size in bytes (1..6).
 */
static uint8_t sizeElement(const char* _data, int32_t _lenMax) {
	if (_lenMax < 0) {
		return 0;
	}
	if (_lenMax == 0) {
		// nothing readable: report a single byte without touching _data
		return 1;
	}
	const uint8_t lead = static_cast<uint8_t>(_data[0]);
	uint8_t expected = 1;
	if ((lead & 0x80) == 0x00) {
		expected = 1; // 0xxxxxxx
	} else if ((lead & 0xE0) == 0xC0) {
		expected = 2; // 110xxxxx
	} else if ((lead & 0xF0) == 0xE0) {
		expected = 3; // 1110xxxx
	} else if ((lead & 0xF8) == 0xF0) {
		expected = 4; // 11110xxx
	} else if ((lead & 0xFC) == 0xF8) {
		expected = 5; // 111110xx
	} else if ((lead & 0xFC) == 0xFC) {
		expected = 6; // 111111xx
	} else {
		// 10xxxxxx: a continuation byte used as first byte
		return 1;
	}
	if (_lenMax < expected) {
		// truncated sequence
		return 1;
	}
	for (uint8_t index = 1; index < expected; ++index) {
		if ((_data[index] & 0xC0) != 0x80) {
			// malformed continuation byte
			return 1;
		}
	}
	return expected;
}
/**
 * @brief Decode the first encoded element of a 0-terminated UTF-8 buffer.
 *
 * Handles the 1 to 4 byte forms plus the 5 and 6 byte forms (lead bytes
 * 111110xx and 111111xx). When the sequence is truncated or malformed,
 * sizeElement() reports a single byte and the low 7 bits of the first byte
 * are returned.
 *
 * @param[in] _input Pointer to the first byte of the element (may be null).
 * @return The decoded character, or 0 when _input is null.
 */
char32_t utf8::convertChar32(const char* _input) {
	if (null == _input) {
		return 0;
	}
	// An element is at most 6 bytes long, so only the first 6 bytes have to
	// be inspected. Measuring the whole buffer here would make callers that
	// decode a string element by element quadratic.
	int32_t available = 0;
	while (    available < 6
	        && _input[available] != '\0') {
		++available;
	}
	const int32_t len = sizeElement(_input, available);
	if (    len < 1
	     || len > 6) {
		// Not expected (sizeElement only returns 0 for a negative length);
		// return the raw byte without sign extension.
		return (uint8_t)_input[0];
	}
	// Payload bits carried by the lead byte, indexed by (len - 1):
	//   0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx, 111110xx, 111111xx
	static const uint8_t leadMask[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x03 };
	char32_t value = ((uint8_t)_input[0]) & leadMask[len - 1];
	// Each continuation byte (10xxxxxx) contributes 6 more bits.
	for (int32_t index = 1; index < len; ++index) {
		value = (value << 6) | (((uint8_t)_input[index]) & 0x3F);
	}
	return value;
}
/**
 * @brief Get the size in bytes of an encoded element from its first byte.
 * @param[in] _input First byte of the element.
 * @return 1 to 6 depending on the lead byte pattern; 1 when the byte is a
 *         continuation byte (10xxxxxx) and therefore not a valid lead byte.
 */
int8_t utf8::length(const char _input) {
	const uint8_t lead = (uint8_t)_input;
	if ((lead & 0x80) == 0x00) {
		return 1; // 0xxxxxxx
	}
	if ((lead & 0xE0) == 0xC0) {
		return 2; // 110xxxxx
	}
	if ((lead & 0xF0) == 0xE0) {
		return 3; // 1110xxxx
	}
	if ((lead & 0xF8) == 0xF0) {
		return 4; // 11110xxx
	}
	if ((lead & 0xFC) == 0xF8) {
		return 5; // 111110xx
	}
	if ((lead & 0xFC) == 0xFC) {
		// 111111xx is the 6 byte form, as in the encoder and in sizeElement();
		// this branch used to return 5.
		return 6;
	}
	return 1;
}
/**
 * @brief Check whether a byte can be the first byte of an encoded element.
 * @param[in] _input Byte to test.
 * @return false when the byte has the continuation form 10xxxxxx, true for
 *         a single byte element (0xxxxxxx) or a multi-byte lead byte (11xxxxxx).
 */
bool utf8::first(const char _input) {
	// Only the two top bits matter: "10" marks a continuation byte.
	const int topBits = _input & 0xC0;
	return topBits != 0x80;
}
2017-08-28 00:02:11 +02:00
/**
 * @brief Convert a 0-terminated UTF-8 buffer to a 32-bit character string.
 *
 * The buffer is decoded element by element with utf8::convertChar32(); the
 * read position advances by the size reported by sizeElement().
 *
 * @param[in] _input Buffer to convert (may be null).
 * @return The decoded string, empty when _input is null.
 */
etk::UString utf8::convertUnicode(const char* _input) {
	if (_input == null) {
		return U"";
	}
	etk::UString out;
	const int64_t inputLen = strlen(_input);
	int64_t pos = 0;
	while (pos < inputLen) {
		// An element is at most 6 bytes long: clamp the remaining size so
		// it always fits in the int32_t expected by sizeElement(), even
		// for buffers larger than 2 GiB.
		const int64_t remaining = inputLen - pos;
		const int32_t lenMax = (remaining > 6) ? 6 : (int32_t)remaining;
		const uint8_t elementSize = sizeElement(&_input[pos], lenMax);
		out += utf8::convertChar32(&_input[pos]);
		pos += elementSize;
	}
	return out;
}
/**
 * @brief Convert a UTF-8 etk::String to a 32-bit character string.
 *        Forwards the string's c_str() buffer to the const char* overload.
 * @param[in] _input String to convert.
 * @return The decoded string.
 */
etk::UString utf8::convertUnicode(const etk::String& _input) {
return utf8::convertUnicode(_input.c_str());
}