ewol/Sources/libetk/etk/unicode.cpp

797 lines
23 KiB
C++

/**
*******************************************************************************
* @file unicode.cpp
* @brief Editeur De N'ours : Abstraction Charset layer (Sources)
* @author Edouard DUPIN
* @date 18/01/2012
* @par Project
* Ewol TK
*
* @par Copyright
* Copyright 2011 Edouard DUPIN, all right reserved
*
* This software is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY.
*
* Licence summary :
* You can modify and redistribute the sources code and binaries.
* You can send me the bug-fix
*
* Term of the licence in in the file licence.txt.
*
*******************************************************************************
*/
// cf : http://unicode.org/fr/charts/symbols.html#CombiningDiacriticalMarks
#include <etk/Types.h>
#include <etk/Debug.h>
#include <etk/unicodeTable.h>
#include <etk/unicode.h>
// transform ISO <==> Unicode
void unicode::convertIsoToUnicode(charset_te inputCharset, char input_ISO, uniChar_t & output_Unicode)
{
switch(inputCharset)
{
case EDN_CHARSET_ISO_8859_1: output_Unicode = TableIso8859_1[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_2: output_Unicode = TableIso8859_2[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_3: output_Unicode = TableIso8859_3[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_4: output_Unicode = TableIso8859_4[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_5: output_Unicode = TableIso8859_5[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_6: output_Unicode = TableIso8859_6[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_7: output_Unicode = TableIso8859_7[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_8: output_Unicode = TableIso8859_8[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_9: output_Unicode = TableIso8859_9[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_10: output_Unicode = TableIso8859_10[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_11: output_Unicode = TableIso8859_11[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_13: output_Unicode = TableIso8859_13[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_14: output_Unicode = TableIso8859_14[(uint32_t)input_ISO&0xFF]; break;
case EDN_CHARSET_ISO_8859_15:
output_Unicode = TableIso8859_15[(uint32_t)input_ISO&0xFF];
#if 0
if ((uint32_t)input_ISO&0xFF == 0xb0) {
printf("Change Element : 0x%02x ==> 0x%08x \n", (uint32_t)input_ISO&0xFF, output_Unicode);
}
#endif
break;
default :
TK_WARNING("Unknow charset ... " << inputCharset);
output_Unicode = '?';
break;
}
}
void unicode::convertUnicodeToIso(charset_te inputCharset, uniChar_t input_Unicode, char & output_ISO)
{
const uniChar_t *tmpTable = NULL;
switch(inputCharset)
{
case EDN_CHARSET_ISO_8859_1: tmpTable = TableIso8859_1; break;
case EDN_CHARSET_ISO_8859_2: tmpTable = TableIso8859_2; break;
case EDN_CHARSET_ISO_8859_3: tmpTable = TableIso8859_3; break;
case EDN_CHARSET_ISO_8859_4: tmpTable = TableIso8859_4; break;
case EDN_CHARSET_ISO_8859_5: tmpTable = TableIso8859_5; break;
case EDN_CHARSET_ISO_8859_6: tmpTable = TableIso8859_6; break;
case EDN_CHARSET_ISO_8859_7: tmpTable = TableIso8859_7; break;
case EDN_CHARSET_ISO_8859_8: tmpTable = TableIso8859_8; break;
case EDN_CHARSET_ISO_8859_9: tmpTable = TableIso8859_9; break;
case EDN_CHARSET_ISO_8859_10: tmpTable = TableIso8859_10; break;
case EDN_CHARSET_ISO_8859_11: tmpTable = TableIso8859_11; break;
case EDN_CHARSET_ISO_8859_13: tmpTable = TableIso8859_13; break;
case EDN_CHARSET_ISO_8859_14: tmpTable = TableIso8859_14; break;
case EDN_CHARSET_ISO_8859_15: tmpTable = TableIso8859_15; break;
default :
TK_WARNING("Unknow charset ... " << inputCharset);
output_ISO = '?';
return;
}
int32_t i;
for (i=0; i<256; i++) {
if (tmpTable[i] == input_Unicode) {
output_ISO = (char)i;
return;
}
}
}
int32_t unicode::convertIsoToUnicode(charset_te inputCharset, etk::Vector<char>& input_ISO, etk::Vector<uniChar_t>& output_Unicode)
{
output_Unicode.Clear();
uniChar_t output;
for(int32_t iii=0; iii<input_ISO.Size(); iii++) {
convertIsoToUnicode(inputCharset, (char)input_ISO[iii], output);
output_Unicode.PushBack(output);
}
if (output_Unicode.Size() == 0) {
output_Unicode.PushBack(0);
} else if (output_Unicode[output_Unicode.Size()-1] != 0) {
output_Unicode.PushBack(0);
}
return output_Unicode.Size();
}
int32_t unicode::convertIsoToUnicode(charset_te inputCharset, etk::Vector<int8_t>& input_ISO, etk::Vector<uniChar_t>& output_Unicode)
{
output_Unicode.Clear();
uniChar_t output;
for(int32_t iii=0; iii<input_ISO.Size(); iii++) {
convertIsoToUnicode(inputCharset, (char)input_ISO[iii], output);
output_Unicode.PushBack(output);
}
if (output_Unicode.Size() == 0) {
output_Unicode.PushBack(0);
} else if (output_Unicode[output_Unicode.Size()-1] != 0) {
output_Unicode.PushBack(0);
}
return output_Unicode.Size();
}
int32_t unicode::convertUnicodeToIso(charset_te inputCharset, etk::Vector<uniChar_t>& input_Unicode, etk::Vector<char>& output_ISO)
{
output_ISO.Clear();
char output[10];
for(int32_t iii=0; iii<input_Unicode.Size(); iii++) {
convertUnicodeToUtf8(input_Unicode[iii], output);
char * tmp = output;
while(*tmp != '\0') {
output_ISO.PushBack(*tmp);
tmp++;
}
}
output_ISO.PushBack(0);
return output_ISO.Size();
}
int32_t unicode::convertUnicodeToIso(charset_te inputCharset, etk::Vector<uniChar_t>& input_Unicode, etk::Vector<int8_t>& output_ISO)
{
output_ISO.Clear();
char output[10];
for(int32_t iii=0; iii<input_Unicode.Size(); iii++) {
convertUnicodeToUtf8(input_Unicode[iii], output);
char * tmp = output;
while(*tmp != '\0') {
output_ISO.PushBack(*tmp);
tmp++;
}
}
output_ISO.PushBack(0);
return output_ISO.Size();
}
static uint32_t unicodeToUtf8(uniChar_t value)
{
uint32_t output = 0;
if (127 >= value) {
output = value;
} else if (2047 >= value) {
// output ==> 00000000 00000000 110xxxxx 10xxxxxx
// input ==> -------- -------- -----222 22111111
output = 0x0000C080;
output+= (value & 0x000007C0)<<2;
output+= value & 0x0000003F;
} else if (65535 >= value) {
// output ==> 00000000 1110xxxx 10xxxxxx 10xxxxxx
// input ==> -------- -------- 33332222 22111111
output = 0x00E08080;
output+= (value & 0x0000F000)<<4;
output+= (value & 0x00000FC0)<<2;
output+= value & 0x0000003F;
} else if (1114111 >= value) {
// output ==> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// input ==> -------- ---44433 33332222 22111111
output = 0xF0808080;
output+= (value & 0x001C0000)<<6;
output+= (value & 0x0003F000)<<4;
output+= (value & 0x00000FC0)<<2;
output+= value & 0x0000003F;
} else {
TK_ERROR("NON UTF8 caracter input...");
return 0;
}
//printf("utf8convertion : %d=%08x ==> %08x\n",value, value, output);
return output;
}
// Transform UTF-8 <==> Unicode
void unicode::convertUnicodeToUtf8(uniChar_t input_Unicode, char * output_UTF8)
{
uint32_t value = unicodeToUtf8(input_Unicode);
if (0xFF >= value) {
output_UTF8[0] = (char)value;
output_UTF8[1] = '\0';
} else if (0xFFFF >= value) {
output_UTF8[0] = (char)((value>>8) & 0x000000FF);
output_UTF8[1] = (char)value;
output_UTF8[2] = '\0';
} else if (0xFFFFFF >= value) {
output_UTF8[0] = (char)((value>>16) & 0x000000FF);
output_UTF8[1] = (char)((value>>8) & 0x000000FF);
output_UTF8[2] = (char)value;
output_UTF8[3] = '\0';
} else {
output_UTF8[0] = (char)((value>>24) & 0x000000FF);
output_UTF8[1] = (char)((value>>16) & 0x000000FF);
output_UTF8[2] = (char)((value>>8) & 0x000000FF);
output_UTF8[3] = (char)value;
output_UTF8[4] = '\0';
}
}
void unicode::convertUtf8ToUnicode(char * input_UTF8, uniChar_t &output_Unicode)
{
int32_t len = strlen(input_UTF8);
output_Unicode = 0;
switch (len) {
case 1:
output_Unicode = (uint8_t)(input_UTF8[0]) & 0x7F;
break;
case 2:
output_Unicode = (((uint8_t)input_UTF8[0]) & 0x1F)<< 6;
output_Unicode += ((uint8_t)input_UTF8[1]) & 0x3F;
break;
case 3:
output_Unicode = (((uint8_t)input_UTF8[0]) & 0x0F)<< 12;
output_Unicode += (((uint8_t)input_UTF8[1]) & 0x3F)<< 6;
output_Unicode += ((uint8_t)input_UTF8[2]) & 0x3F;
break;
default:
output_Unicode = (((uint8_t)input_UTF8[0]) & 0x07)<< 18;
output_Unicode += (((uint8_t)input_UTF8[1]) & 0x3F)<< 12;
output_Unicode += (((uint8_t)input_UTF8[2]) & 0x3F)<< 6;
output_Unicode += ((uint8_t)input_UTF8[3]) & 0x3F;
break;
}
}
int32_t unicode::convertUnicodeToUtf8(const etk::Vector<uniChar_t>& input_Unicode, etk::Vector<char>& output_UTF8)
{
char output[10];
for (int32_t iii=0; iii<input_Unicode.Size(); iii++) {
unicode::convertUnicodeToUtf8(input_Unicode[iii], output);
char * tmp = output ;
while (*tmp != '\0') {
output_UTF8.PushBack(*tmp);
tmp++;
}
}
output_UTF8.PushBack('\0');
return output_UTF8.Size()-1;
}
int32_t unicode::convertUnicodeToUtf8(const etk::Vector<uniChar_t>& input_Unicode, etk::Vector<int8_t>& output_UTF8)
{
char output[10];
for (int32_t iii=0; iii<input_Unicode.Size(); iii++) {
unicode::convertUnicodeToUtf8(input_Unicode[iii], output);
char * tmp = output ;
while (*tmp != '\0') {
output_UTF8.PushBack((int8_t)*tmp);
tmp++;
}
}
output_UTF8.PushBack('\0');
return output_UTF8.Size()-1;
}
int32_t unicode::convertUtf8ToUnicode(etk::Vector<char>& input_UTF8, etk::Vector<uniChar_t>& output_Unicode)
{
char tmpData[20];
int32_t pos = 0;
while (pos < input_UTF8.Size()) {
int32_t lenMax = input_UTF8.Size() - pos;
//4 case
if( 1<=lenMax
&& 0x00 == (input_UTF8[pos+0] & 0x80) )
{
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = '\0';
pos += 1;
} else if( 2<=lenMax
&& 0xC0 == (input_UTF8[pos+0] & 0xE0)
&& 0x80 == (input_UTF8[pos+1] & 0xC0) ) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = '\0';
pos += 2;
} else if( 3<=lenMax
&& 0xE0 == (input_UTF8[pos+0] & 0xF0)
&& 0x80 == (input_UTF8[pos+1] & 0xC0)
&& 0x80 == (input_UTF8[pos+2] & 0xC0)) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = input_UTF8[pos+2];
tmpData[3] = '\0';
pos += 3;
} else if( 4<=lenMax
&& 0xF0 == (input_UTF8[pos+0] & 0xF8)
&& 0x80 == (input_UTF8[pos+1] & 0xC0)
&& 0x80 == (input_UTF8[pos+2] & 0xC0)
&& 0x80 == (input_UTF8[pos+3] & 0xC0)) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = input_UTF8[pos+2];
tmpData[3] = input_UTF8[pos+3];
tmpData[4] = '\0';
pos += 4;
} else {
tmpData[0] = '\0';
pos += 1;
}
uniChar_t tmpUnicode;
convertUtf8ToUnicode(tmpData, tmpUnicode);
output_Unicode.PushBack(tmpUnicode);
}
return 0;
}
int32_t unicode::convertUtf8ToUnicode(etk::Vector<int8_t>& input_UTF8, etk::Vector<uniChar_t>& output_Unicode)
{
char tmpData[20];
int32_t pos = 0;
while (pos < input_UTF8.Size()) {
int32_t lenMax = input_UTF8.Size() - pos;
//4 case
if( 1<=lenMax
&& 0x00 == (input_UTF8[pos+0] & 0x80) )
{
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = '\0';
pos += 1;
} else if( 2<=lenMax
&& 0xC0 == (input_UTF8[pos+0] & 0xE0)
&& 0x80 == (input_UTF8[pos+1] & 0xC0) ) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = '\0';
pos += 2;
} else if( 3<=lenMax
&& 0xE0 == (input_UTF8[pos+0] & 0xF0)
&& 0x80 == (input_UTF8[pos+1] & 0xC0)
&& 0x80 == (input_UTF8[pos+2] & 0xC0)) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = input_UTF8[pos+2];
tmpData[3] = '\0';
pos += 3;
} else if( 4<=lenMax
&& 0xF0 == (input_UTF8[pos+0] & 0xF8)
&& 0x80 == (input_UTF8[pos+1] & 0xC0)
&& 0x80 == (input_UTF8[pos+2] & 0xC0)
&& 0x80 == (input_UTF8[pos+3] & 0xC0)) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = input_UTF8[pos+2];
tmpData[3] = input_UTF8[pos+3];
tmpData[4] = '\0';
pos += 4;
} else {
tmpData[0] = '\0';
pos += 1;
}
uniChar_t tmpUnicode;
convertUtf8ToUnicode(tmpData, tmpUnicode);
output_Unicode.PushBack(tmpUnicode);
}
return 0;
}
int32_t unicode::convertUtf8ToUnicode(char * input_UTF8, etk::Vector<uniChar_t>& output_Unicode)
{
char tmpData[20];
int32_t pos = 0;
if (NULL == input_UTF8) {
return 0;
}
int32_t len = strlen(input_UTF8);
while (pos < len) {
int32_t lenMax = len - pos;
//4 case
if( 1<=lenMax
&& 0x00 == (input_UTF8[pos+0] & 0x80) )
{
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = '\0';
pos += 1;
} else if( 2<=lenMax
&& 0xC0 == (input_UTF8[pos+0] & 0xE0)
&& 0x80 == (input_UTF8[pos+1] & 0xC0) ) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = '\0';
pos += 2;
} else if( 3<=lenMax
&& 0xE0 == (input_UTF8[pos+0] & 0xF0)
&& 0x80 == (input_UTF8[pos+1] & 0xC0)
&& 0x80 == (input_UTF8[pos+2] & 0xC0)) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = input_UTF8[pos+2];
tmpData[3] = '\0';
pos += 3;
} else if( 4<=lenMax
&& 0xF0 == (input_UTF8[pos+0] & 0xF8)
&& 0x80 == (input_UTF8[pos+1] & 0xC0)
&& 0x80 == (input_UTF8[pos+2] & 0xC0)
&& 0x80 == (input_UTF8[pos+3] & 0xC0)) {
tmpData[0] = input_UTF8[pos+0];
tmpData[1] = input_UTF8[pos+1];
tmpData[2] = input_UTF8[pos+2];
tmpData[3] = input_UTF8[pos+3];
tmpData[4] = '\0';
pos += 4;
} else {
tmpData[0] = '\0';
pos += 1;
}
uniChar_t tmpUnicode;
convertUtf8ToUnicode(tmpData, tmpUnicode);
output_Unicode.PushBack(tmpUnicode);
}
return 0;
}
// Transform ISO <==> UTF-8
void unicode::convertIsoToUtf8(charset_te inputCharset, char input_ISO, char * output_UTF8)
{
uniChar_t tmpUnicode;
// concert Iso in UniCode
convertIsoToUnicode(inputCharset, input_ISO, tmpUnicode );
// convert UniCode in Utf-8
convertUnicodeToUtf8(tmpUnicode, output_UTF8);
}
void unicode::convertUtf8ToIso(charset_te inputCharset, char * input_UTF8, char & output_ISO)
{
uniChar_t tmpUnicode;
// convert Utf-8 in UniCode
convertUtf8ToUnicode(input_UTF8, tmpUnicode);
// concert UniCode in Iso
convertUnicodeToIso(inputCharset, tmpUnicode, output_ISO);
}
int32_t unicode::convertIsoToUtf8(charset_te inputCharset, etk::Vector<char>& input_ISO, etk::Vector<char>& output_UTF8)
{
TK_WARNING("TODO : not coded...");
return 0;
}
int32_t unicode::convertUtf8ToIso(charset_te inputCharset, etk::Vector<char>& input_UTF8, etk::Vector<char>& output_ISO)
{
TK_WARNING("TODO : not coded...");
return 0;
}
/**
* @brief Get the number of element of the curent UTF8 char (in the curent Buffer)
*
* @param[in] data pointer on the curent CHAR string (pointer on the allocated buffer)
* @param[out] size Nb of char use in this UTF8 [0..4]
* @param[out] baseValid true : the ase format of the UTF8 is CORRECT
*
* @return ---
*
*/
void unicode::Utf8_SizeElement(const char * data, int32_t lenMax , uint8_t &size, bool &baseValid)
{
TK_ASSERT(0 <= lenMax, "size can not be < 0 ...");
if (0 > lenMax) {
size = 0;
baseValid = false;
return;
}
//4 case
if ( 1<=lenMax
&& 0x00 == (data[0] & 0x80) ) {
// One Char Element
size = 1;
baseValid = true;
} else if( 2<=lenMax
&& 0xC0 == (data[0] & 0xE0)
&& 0x80 == (data[1] & 0xC0) ) {
size = 2;
baseValid = true;
} else if( 3<=lenMax
&& 0xE0 == (data[0] & 0xF0)
&& 0x80 == (data[1] & 0xC0)
&& 0x80 == (data[2] & 0xC0)) {
size = 3;
baseValid = true;
} else if( 4<=lenMax
&& 0xF0 == (data[0] & 0xF8)
&& 0x80 == (data[1] & 0xC0)
&& 0x80 == (data[2] & 0xC0)
&& 0x80 == (data[3] & 0xC0)) {
size = 4;
baseValid = true;
} else {
// return only one error Caracter ...
baseValid = false;
size = 1;
}
}
#if 0 // Remove for the moment ...
/**
* @brief Get the number of element of the previous UTF8 char (in the curent Buffer)
*
* @param[in] data pointer on the curent CHAR string (pointer on the allocated buffer) (the curent char is not check)
* @param[out] size Nb of char use in this UTF8 [0..4]
* @param[out] baseValid true : the ase format of the UTF8 is CORRECT
*
* @return ---
*
*/
static void Utf8_SizePreviousElement(const char * data, int32_t lenMax, uint8_t &size, bool &baseValid)
{
EDN_ASSERT(0 <= lenMax, "size can not be < 0 ...");
if (0 > lenMax) {
size = 0;
baseValid = false;
return;
}
//4 case
if ( 1<=lenMax
&& 0x00 == (data[-1] & 0x80) ) {
// One Char Element
size = 1;
baseValid = true;
} else if( 2<=lenMax
&& 0xC0 == (data[-2] & 0xE0)
&& 0x80 == (data[-1] & 0xC0) ) {
size = 2;
baseValid = true;
} else if( 3<=lenMax
&& 0xE0 == (data[-3] & 0xF0)
&& 0x80 == (data[-2] & 0xC0)
&& 0x80 == (data[-1] & 0xC0)) {
size = 3;
baseValid = true;
} else if( 4<=lenMax
&& 0xF0 == (data[-4] & 0xF8)
&& 0x80 == (data[-3] & 0xC0)
&& 0x80 == (data[-2] & 0xC0)
&& 0x80 == (data[-1] & 0xC0)) {
size = 4;
baseValid = true;
} else {
// return only one error Caracter ...
baseValid = false;
size = 1;
}
}
#endif
/**
* @brief
*
* @param[in,out]
*
* @return
*
*/
/*
static uint32_t Utf8_GetValue(UTF8Element_ts &Element)
{
uint32_t value = 0;
const char * data = m_data + Element.CharPosition;
//4 case
switch(Element.CharSize)
{
case 1:
value = data[0] & 0x7F;
break;
case 2:
value = (data[0] & 0x1F)<< 6;
value += data[1] & 0x3F;
break;
case 3:
value = (data[0] & 0x0F)<< 12;
value += (data[1] & 0x3F)<< 6;
value += data[2] & 0x3F;
break;
case 4:
value = (data[0] & 0x07)<< 18;
value += (data[1] & 0x3F)<< 12;
value += (data[2] & 0x3F)<< 6;
value += data[3] & 0x3F;
break;
default:
// return only one error Caracter ...
EDN_ASSERT(false, "impossible case....");
break;
}
// check the validity of the UTF8 ...
if( ( 0xD800 <= value
&& 0xDFFF >= value )
|| ( 0xFDD0 <= value
&& 0xFDEF >= value )
|| ( 0xFFFE <= value
&& 0xFFFF >= value )
|| ( 0x1FFFE <= value
&& 0x1FFFF >= value )
|| ( 0x2FFFE <= value
&& 0xDFFFF >= value )
|| ( 0xEFFFE <= value
&& 0xEFFFF >= value )
|| ( 0xFFFFE <= value
&& 0xFFFFF >= value )
|| ( 0x10FFFE <= value
&& 0x10FFFF >= value ) )
{
// overwrite the UTF8 validity ==> this is not a diaplayable element
Element.ValidUTF8 = false;
return value;
}
return value;
}
*/
int32_t unicode::strUtf8Len(const char *input_UTF8)
{
int32_t count = 0;
int32_t size = strlen(input_UTF8);
uint8_t tmpSize;
bool baseValid;
while (size > 0) {
Utf8_SizeElement(input_UTF8, size , tmpSize, baseValid);
input_UTF8 += tmpSize;
size -= tmpSize;
count++;
}
return count;
}
// **************************************************************************************************************
// simple convertion optention
// **************************************************************************************************************
#if 0
Procedure de recuperation des charset sans ce casser les ...
// generate the basic file
FILE * mfile = NULL;
mfile = fopen("fichierIsoBase", "wb");
if (NULL == mfile) {
EDN_ERROR("Error to create file");
return false;
}
char newline = '\n';
for(int32_t i=0x20; i<0x100; i++) {
char plop = i;
fwrite(&plop, sizeof(char), 1, mfile);
fwrite(&newline, sizeof(char), 1, mfile);
}
fclose(mfile);
// console script to convert files :
iconv -c --from-code=ISO-8859-1 --to-code=UTF-8 -o fichierUTF8_iso-1 fichierIsoBase
iconv -c --from-code=ISO-8859-2 --to-code=UTF-8 -o fichierUTF8_iso-2 fichierIsoBase
iconv -c --from-code=ISO-8859-3 --to-code=UTF-8 -o fichierUTF8_iso-3 fichierIsoBase
iconv -c --from-code=ISO-8859-4 --to-code=UTF-8 -o fichierUTF8_iso-4 fichierIsoBase
iconv -c --from-code=ISO-8859-5 --to-code=UTF-8 -o fichierUTF8_iso-5 fichierIsoBase
iconv -c --from-code=ISO-8859-6 --to-code=UTF-8 -o fichierUTF8_iso-6 fichierIsoBase
iconv -c --from-code=ISO-8859-7 --to-code=UTF-8 -o fichierUTF8_iso-7 fichierIsoBase
iconv -c --from-code=ISO-8859-8 --to-code=UTF-8 -o fichierUTF8_iso-8 fichierIsoBase
iconv -c --from-code=ISO-8859-9 --to-code=UTF-8 -o fichierUTF8_iso-9 fichierIsoBase
iconv -c --from-code=ISO-8859-10 --to-code=UTF-8 -o fichierUTF8_iso-10 fichierIsoBase
iconv -c --from-code=ISO-8859-11 --to-code=UTF-8 -o fichierUTF8_iso-11 fichierIsoBase
iconv -c --from-code=ISO-8859-12 --to-code=UTF-8 -o fichierUTF8_iso-12 fichierIsoBase
iconv -c --from-code=ISO-8859-13 --to-code=UTF-8 -o fichierUTF8_iso-13 fichierIsoBase
iconv -c --from-code=ISO-8859-14 --to-code=UTF-8 -o fichierUTF8_iso-14 fichierIsoBase
iconv -c --from-code=ISO-8859-15 --to-code=UTF-8 -o fichierUTF8_iso-15 fichierIsoBase
// NOTE : Le format 12 n'existe pas ...
FILE * mfileout = NULL;
mfileout = fopen("outputGeneration.c", "wb");
if (NULL == mfileout) {
EDN_ERROR("Error to create file");
return false;
}
char * inputFileData[] = {
"fichierUTF8_iso-1",
"fichierUTF8_iso-2",
// "fichierUTF8_iso-3",
"fichierUTF8_iso-4",
"fichierUTF8_iso-5",
/* "fichierUTF8_iso-6",
"fichierUTF8_iso-7",
"fichierUTF8_iso-8",
"fichierUTF8_iso-9",
"fichierUTF8_iso-10",
"fichierUTF8_iso-11",
"fichierUTF8_iso-13",
"fichierUTF8_iso-14",
*/
"fichierUTF8_iso-15"
};
for (int32_t k=0; k<5; k++) {
FILE * mfile = NULL;
mfile = fopen(inputFileData[k], "rb");
if (NULL == mfile) {
EDN_ERROR("Error to open file");
return false;
}
char data[255] ;
fprintf(mfileout, "\tTYPESTRUCT_TS %s[] = {\n\t\t", inputFileData[k]);
for(int32_t i=0x0; i<0x10; i++) {
fprintf(mfileout, "0x%08X, ", i);
}
fprintf(mfileout, "\n\t\t");
for(int32_t i=0x10; i<0x20; i++) {
fprintf(mfileout, "0x%08X, ", i);
}
for(int32_t i=0x20; i<0x100; i++) {
if (0==i%16) {
fprintf(mfileout, "\n\t\t");
}
fgets(data, 25, mfile );
data[strlen(data)-1] = '\0';
EDN_INFO("sizeofLine=" << strlen(data) << " data=\"" << data << "\"");
// convert in int :
int32_t valUTF8 = 0;
int32_t valUnicode = 0;
switch (strlen(data)) {
case 1:
valUTF8 = (uint8_t) (data[0]);
valUnicode = (uint8_t)(data[0]) & 0x7F;
break;
case 2:
valUTF8 = (uint8_t) (data[0]) << 8;
valUTF8 += (uint8_t) (data[1]);
valUnicode = (((uint8_t)data[0]) & 0x1F)<< 6;
valUnicode += ((uint8_t)data[1]) & 0x3F;
break;
case 3:
valUTF8 = (uint8_t) (data[0]) << 16;
valUTF8 += (uint8_t) (data[1]) << 8;
valUTF8 += (uint8_t) (data[2]);
valUnicode = (((uint8_t)data[0]) & 0x0F)<< 12;
valUnicode += (((uint8_t)data[1]) & 0x3F)<< 6;
valUnicode += ((uint8_t)data[2]) & 0x3F;
break;
default:
valUTF8 = (uint8_t) (data[0]) <<24;
valUTF8 += (uint8_t) (data[1]) << 16;
valUTF8 += (uint8_t) (data[2]) << 8;
valUTF8 += (uint8_t) (data[3]);
valUnicode = (((uint8_t)data[0]) & 0x07)<< 18;
valUnicode += (((uint8_t)data[1]) & 0x3F)<< 12;
valUnicode += (((uint8_t)data[2]) & 0x3F)<< 6;
valUnicode += ((uint8_t)data[3]) & 0x3F;
break;
}
fprintf(mfileout, "0x%08X, ", valUnicode);
}
fprintf(mfileout, "\n\t};\n\n");
fclose(mfile);
}
fclose(mfileout);
#endif