icu/source/i18n/csdetect.cpp

488 lines
14 KiB
C++

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2005-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#include "unicode/ucsdet.h"
#include "csdetect.h"
#include "csmatch.h"
#include "uenumimp.h"
#include "cmemory.h"
#include "cstring.h"
#include "umutex.h"
#include "ucln_in.h"
#include "uarrsort.h"
#include "inputext.h"
#include "csrsbcs.h"
#include "csrmbcs.h"
#include "csrutf8.h"
#include "csrucode.h"
#include "csr2022.h"
// Allocate uninitialized storage for `count` objects of `type` via uprv_malloc.
// No constructors are run; the result must be checked for NULL by the caller.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Release storage through uprv_free. No destructors are run.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
U_NAMESPACE_BEGIN
// Pairs a charset recognizer with the flag saying whether it is part of the
// default detectable set. The info object owns the recognizer and deletes it
// in its destructor.
struct CSRecognizerInfo : public UMemory {
    // Takes ownership of `recognizer` (which may be NULL if its allocation failed).
    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}

    ~CSRecognizerInfo() { delete recognizer; }

    CharsetRecognizer *recognizer;   // owned
    UBool isDefaultEnabled;          // enabled unless a detector overrides it

private:
    // Not copyable: a copy would share `recognizer` and delete it twice.
    // Declared but intentionally left undefined.
    CSRecognizerInfo(const CSRecognizerInfo &);
    CSRecognizerInfo &operator=(const CSRecognizerInfo &);
};
U_NAMESPACE_END
// Table of all known recognizers. Allocated and filled by initRecognizers(),
// released by csdet_cleanup().
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
// Passed to umtx_initOnce() in CharsetDetector::setRecognizers() to run
// initRecognizers(); reset by csdet_cleanup().
static icu::UInitOnce gCSRecognizersInitOnce;
// Number of entries in fCSRecognizers; stays 0 until the table has been allocated.
static int32_t fCSRecognizers_size = 0;
U_CDECL_BEGIN
// Library cleanup hook (registered in initRecognizers): destroys every
// recognizer info, frees the global table, and re-arms the init-once guard.
// Always returns TRUE.
static UBool U_CALLCONV csdet_cleanup(void)
{
    U_NAMESPACE_USE

    if (fCSRecognizers != NULL) {
        int32_t idx = 0;
        while (idx < fCSRecognizers_size) {
            delete fCSRecognizers[idx];
            fCSRecognizers[idx] = NULL;
            ++idx;
        }

        DELETE_ARRAY(fCSRecognizers);
        fCSRecognizers_size = 0;
        fCSRecognizers = NULL;
    }

    gCSRecognizersInitOnce.reset();
    return TRUE;
}
// Comparator for uprv_sortArray over an array of CharsetMatch pointers.
// `left` and `right` each point at one array element (a CharsetMatch *).
// The context argument is unused.
static int32_t U_CALLCONV
charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
{
    U_NAMESPACE_USE

    const CharsetMatch *lhs = *static_cast<const CharsetMatch * const *>(left);
    const CharsetMatch *rhs = *static_cast<const CharsetMatch * const *>(right);

    // Deliberately reversed (right minus left) so that the sort runs from
    // highest confidence to lowest.
    return rhs->getConfidence() - lhs->getConfidence();
}
// One-time initializer for the global recognizer table (run through
// umtx_initOnce from CharsetDetector::setRecognizers).
//
// Registers csdet_cleanup, builds one CSRecognizerInfo per supported charset,
// and publishes them in fCSRecognizers / fCSRecognizers_size.
//
// On any allocation failure `status` is set to U_MEMORY_ALLOCATION_ERROR:
//  - if the table itself cannot be allocated, every recognizer created so far
//    is destroyed here (nothing else would ever free them);
//  - if an individual entry (or the recognizer inside it) could not be
//    allocated, the table is still published so csdet_cleanup can release the
//    entries that did succeed.
static void U_CALLCONV initRecognizers(UErrorCode &status) {
    U_NAMESPACE_USE
    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
    CSRecognizerInfo *tempArray[] = {
        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
#if !UCONFIG_ONLY_HTML_CONVERSION
        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
        // The EBCDIC recognizers are not part of the default enabled set.
        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
#endif
    };
    int32_t rCount = UPRV_LENGTHOF(tempArray);

    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
    if (fCSRecognizers == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        // The table was never published, so csdet_cleanup cannot reach these
        // objects; release them here to avoid leaking them.
        for (int32_t r = 0; r < rCount; r += 1) {
            delete tempArray[r];
        }
        return;
    }

    fCSRecognizers_size = rCount;
    for (int32_t r = 0; r < rCount; r += 1) {
        fCSRecognizers[r] = tempArray[r];
        // Either the info object or the recognizer it wraps may have failed
        // to allocate; both leave the table unusable.
        if (fCSRecognizers[r] == NULL || fCSRecognizers[r]->recognizer == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
U_CDECL_END
U_NAMESPACE_BEGIN
// Ensures the global recognizer table has been built, running
// initRecognizers at most once through the init-once guard.
// Any failure is reported through `status`.
void CharsetDetector::setRecognizers(UErrorCode &status)
{
    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
}
// Constructs a detector: creates the input-text holder, makes sure the global
// recognizer table exists, and allocates one CharsetMatch result slot per
// recognizer.
//
// On failure `status` is set (U_MEMORY_ALLOCATION_ERROR for allocation
// problems) and the object is left in a state that is safe to destroy.
CharsetDetector::CharsetDetector(UErrorCode &status)
  : textIn(new InputText(status)), resultArray(NULL),
    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
    fEnabledRecognizers(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    // The allocation of the InputText itself can fail without touching status.
    if (textIn == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    setRecognizers(status);
    if (U_FAILURE(status)) {
        return;
    }

    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
    if (resultArray == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Null every slot first: the destructor deletes all fCSRecognizers_size
    // slots, so none may be left uninitialized if an allocation below fails.
    for (int32_t i = 0; i < fCSRecognizers_size; i += 1) {
        resultArray[i] = NULL;
    }

    for (int32_t i = 0; i < fCSRecognizers_size; i += 1) {
        resultArray[i] = new CharsetMatch();
        if (resultArray[i] == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            break;
        }
    }
}
// Releases the input text, every result slot, the result array, and the
// per-detector enabled-recognizer overrides (if any were created).
CharsetDetector::~CharsetDetector()
{
    delete textIn;

    // resultArray stays NULL when the constructor bailed out early, even
    // though fCSRecognizers_size may already be non-zero; do not index it then.
    if (resultArray != NULL) {
        for (int32_t i = 0; i < fCSRecognizers_size; i += 1) {
            delete resultArray[i];
        }
        uprv_free(resultArray);
    }

    if (fEnabledRecognizers) {
        uprv_free(fEnabledRecognizers);
    }
}
// Hands the input bytes (`in`, `len`) to the InputText holder and flags the
// detector so the next detectAll() re-runs the recognizers.
void CharsetDetector::setText(const char *in, int32_t len)
{
    textIn->setText(in, len);

    // Cached results no longer describe the current text.
    fFreshTextSet = TRUE;
}
// Sets the strip-tags flag (passed to InputText::MungeInput during detection)
// and returns the value it had before. Also forces the next detectAll() to
// re-run the recognizers.
UBool CharsetDetector::setStripTagsFlag(UBool flag)
{
    const UBool previous = fStripTags;

    fStripTags = flag;
    fFreshTextSet = TRUE;

    return previous;
}
// Returns the current strip-tags flag.
UBool CharsetDetector::getStripTagsFlag() const
{
    return fStripTags;
}
// Forwards the declared encoding name (`encoding`, `len`) to the InputText
// holder. Does not mark the text as fresh.
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
{
    textIn->setDeclaredEncoding(encoding, len);
}
// Returns the number of entries in the global recognizer table, building the
// table first if necessary. Initialization errors are not reported: the local
// error code is discarded and whatever size is current is returned.
int32_t CharsetDetector::getDetectableCount()
{
    UErrorCode localStatus = U_ZERO_ERROR;

    setRecognizers(localStatus);

    return fCSRecognizers_size;
}
// Runs detectAll() and returns the first entry of the result array (the
// highest-confidence match after sorting), or NULL when nothing matched.
// Errors from detectAll() are reported through `status`.
const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
{
    int32_t matchCount = 0;

    detectAll(matchCount, status);

    return (matchCount > 0) ? resultArray[0] : NULL;
}
// Runs every recognizer in the global table against the current input and
// returns the matches sorted from highest to lowest confidence.
//
// maxMatchesFound receives the number of valid entries at the front of the
// returned array. Results are cached: the recognizers only run again after
// the fresh-text flag has been raised (setText / setStripTagsFlag).
//
// If no text has been set, `status` becomes U_MISSING_RESOURCE_ERROR, NULL is
// returned, and maxMatchesFound is left untouched.
//
// NOTE(review): this loop does not consult fEnabledRecognizers, so
// setDetectableCharset() has no effect on which recognizers run here —
// confirm whether that is intended.
const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
{
    if (!textIn->isSet()) {
        status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
        return NULL;
    }

    if (fFreshTextSet) {
        textIn->MungeInput(fStripTags);

        // Try each recognizer in table order. A recognizer that reports a
        // match has filled the next free result slot; otherwise that slot is
        // offered to the following recognizer.
        resultCount = 0;
        for (int32_t idx = 0; idx < fCSRecognizers_size; ++idx) {
            CharsetRecognizer *candidate = fCSRecognizers[idx]->recognizer;
            if (candidate->match(textIn, resultArray[resultCount])) {
                ++resultCount;
            }
        }

        if (resultCount > 1) {
            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
        }

        fFreshTextSet = FALSE;
    }

    maxMatchesFound = resultCount;
    return resultArray;
}
// Enables or disables one charset, by recognizer name, for this detector.
//
// encoding  name to look up in the global recognizer table (exact match).
// enabled   new enabled state for that charset.
// status    set to U_ILLEGAL_ARGUMENT_ERROR if `encoding` is NULL or matches
//           no recognizer, or U_MEMORY_ALLOCATION_ERROR if the override array
//           cannot be allocated. Nothing happens if status is already a failure.
//
// The per-detector override array is created lazily, the first time a
// setting differs from the recognizer's default.
void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return;
    }

    // A NULL name cannot match anything and must not reach uprv_strcmp.
    if (encoding == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    int32_t modIdx = -1;
    UBool isDefaultVal = FALSE;
    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
            modIdx = i;
            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
            break;
        }
    }
    if (modIdx < 0) {
        // No matching encoding found
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (fEnabledRecognizers == NULL && !isDefaultVal) {
        // Create an array storing the non default setting
        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
        if (fEnabledRecognizers == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        // Initialize the array with default info
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
        }
    }

    // When no override array exists the request equals the default, so there
    // is nothing to record.
    if (fEnabledRecognizers != NULL) {
        fEnabledRecognizers[modIdx] = enabled;
    }
}
/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
{
if( index > fCSRecognizers_size-1 || index < 0) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
} else {
return fCSRecognizers[index]->getName();
}
}*/
U_NAMESPACE_END
U_CDECL_BEGIN
// Per-enumeration state stored in UEnumeration::context.
struct Context {
    // Index into fCSRecognizers of the next recognizer to examine.
    int32_t currIndex;
    // TRUE: enumerate every recognizer; FALSE: only the enabled ones.
    UBool all;
    // Per-detector enabled flags, one per recognizer, or NULL to use each
    // recognizer's default. Not owned by the enumeration.
    UBool *enabledRecognizers;
};
// UEnumeration close callback: frees the Context (when present) and then the
// enumeration object itself.
static void U_CALLCONV
enumClose(UEnumeration *en) {
    void *state = en->context;

    if (state != NULL) {
        DELETE_ARRAY(state);
    }

    DELETE_ARRAY(en);
}
// UEnumeration count callback.
// For an "all charsets" enumeration this is the size of the recognizer table;
// otherwise it is the number of recognizers that are enabled, judged by the
// detector's override array when one exists and by the defaults when not.
static int32_t U_CALLCONV
enumCount(UEnumeration *en, UErrorCode *) {
    const Context *ctx = (const Context *)en->context;

    if (ctx->all) {
        // ucsdet_getAllDetectableCharsets, all charset detector names
        return fCSRecognizers_size;
    }

    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
    const UBool *customFlags = ctx->enabledRecognizers;
    int32_t enabledTotal = 0;

    for (int32_t idx = 0; idx < fCSRecognizers_size; ++idx) {
        // Custom set when the detector has overrides, default set otherwise.
        const UBool isOn = (customFlags != NULL)
                               ? customFlags[idx]
                               : fCSRecognizers[idx]->isDefaultEnabled;
        if (isOn) {
            ++enabledTotal;
        }
    }

    return enabledTotal;
}
// UEnumeration next callback.
// Returns the name of the next charset in the enumeration, or NULL once the
// recognizer table is exhausted. When resultLength is non-NULL it receives
// the length of the returned name (0 for NULL). The status argument is unused.
static const char* U_CALLCONV
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
    Context *ctx = (Context *)en->context;
    const char *name = NULL;

    if (ctx->all) {
        // ucsdet_getAllDetectableCharsets, all charset detector names
        if (ctx->currIndex < fCSRecognizers_size) {
            name = fCSRecognizers[ctx->currIndex]->recognizer->getName();
            ctx->currIndex++;
        }
    } else {
        // ucsdet_getDetectableCharsets: skip forward to the next enabled
        // recognizer, using the custom set if the detector has one and the
        // default set otherwise.
        const UBool *customFlags = ctx->enabledRecognizers;

        while (name == NULL && ctx->currIndex < fCSRecognizers_size) {
            const int32_t idx = ctx->currIndex;
            const UBool isOn = (customFlags != NULL)
                                   ? customFlags[idx]
                                   : fCSRecognizers[idx]->isDefaultEnabled;
            if (isOn) {
                name = fCSRecognizers[idx]->recognizer->getName();
            }
            ctx->currIndex++;
        }
    }

    if (resultLength != NULL) {
        *resultLength = (name == NULL) ? 0 : (int32_t)uprv_strlen(name);
    }

    return name;
}
// UEnumeration reset callback: rewinds the enumeration to the first
// recognizer. The status argument is unused.
static void U_CALLCONV
enumReset(UEnumeration *en, UErrorCode *) {
    Context *ctx = (Context *)en->context;
    ctx->currIndex = 0;
}
// Template UEnumeration for charset-name enumerations. Each enumeration
// handed out by CharsetDetector is a byte copy of this object whose context
// pointer is then set to a freshly allocated Context.
// NOTE(review): the meaning of the two leading NULL slots depends on the
// UEnumeration field order declared in uenumimp.h — presumably the base
// context and the context pointer; confirm there.
static const UEnumeration gCSDetEnumeration = {
NULL,
NULL,
enumClose,
enumCount,
uenum_unextDefault,
enumNext,
enumReset
};
U_CDECL_END
U_NAMESPACE_BEGIN
// Creates an enumeration over the names of every recognizer in the global
// table, regardless of enabled state.
//
// Returns a new UEnumeration, or 0 with `status` set when recognizer
// initialization or an allocation fails. The enumeration and its Context are
// released by the enumeration's close callback (enumClose).
UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
{
    /* Initialize recognized charsets. */
    setRecognizers(status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UEnumeration *result = NEW_ARRAY(UEnumeration, 1);
    if (result == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    // Start from the shared callback table, then attach private state.
    memcpy(result, &gCSDetEnumeration, sizeof(UEnumeration));

    Context *ctx = NEW_ARRAY(Context, 1);
    result->context = (void*)ctx;
    if (ctx == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        DELETE_ARRAY(result);
        return 0;
    }

    // Zeroing gives currIndex == 0 and no enabled-recognizer array.
    uprv_memset(ctx, 0, sizeof(Context));
    ctx->all = TRUE;

    return result;
}
// Creates an enumeration over the names of the charsets currently enabled
// for this detector.
//
// Returns a new UEnumeration, or 0 when `status` is already a failure or an
// allocation fails (status is then U_MEMORY_ALLOCATION_ERROR).
//
// NOTE(review): the enumeration stores this detector's fEnabledRecognizers
// pointer as it is at the time of the call, without copying it. It appears
// the enumeration must therefore not outlive the detector, and would not see
// an override array created afterwards — confirm against callers.
UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
{
    if (U_FAILURE(status)) {
        return 0;
    }

    UEnumeration *result = NEW_ARRAY(UEnumeration, 1);
    if (result == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    // Start from the shared callback table, then attach private state.
    memcpy(result, &gCSDetEnumeration, sizeof(UEnumeration));

    Context *ctx = NEW_ARRAY(Context, 1);
    result->context = (void*)ctx;
    if (ctx == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        DELETE_ARRAY(result);
        return 0;
    }

    uprv_memset(ctx, 0, sizeof(Context));
    ctx->all = FALSE;
    ctx->enabledRecognizers = fEnabledRecognizers;

    return result;
}
U_NAMESPACE_END
#endif