icu/source/i18n/nortrans.cpp

179 lines
6.3 KiB
C++

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2001-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/03/01 aliu Creation.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/normalizer2.h"
#include "unicode/utf16.h"
#include "cstring.h"
#include "nortrans.h"
U_NAMESPACE_BEGIN
// Class-level boilerplate for NormalizationTransliterator, supplied by an ICU macro.
// NOTE(review): judging by the macro name this emits the class-ID based RTTI
// implementation for the class -- confirm against unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
/**
 * Packs a C string pointer into a Transliterator::Token.
 *
 * The const qualifier is dropped only because the token carries an untyped
 * pointer; _create() reads the string back as const char *.
 *
 * @param s the string whose address is stored in the token
 * @return the token produced by Transliterator::pointerToken() for s
 */
static inline Transliterator::Token cstrToken(const char *s) {
    void *opaque = const_cast<char *>(s);
    return Transliterator::pointerToken(opaque);
}
/**
* System registration hook.
*/
void NormalizationTransliterator::registerIDs() {
// In the Token, the byte after the NUL is the UNormalization2Mode.
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
_create, cstrToken("nfc\0\0"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
_create, cstrToken("nfkc\0\0"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
_create, cstrToken("nfc\0\1"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
_create, cstrToken("nfkc\0\1"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
_create, cstrToken("nfc\0\2"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
_create, cstrToken("nfc\0\3"));
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
UNICODE_STRING_SIMPLE("NFD"), TRUE);
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
UNICODE_STRING_SIMPLE("NFKD"), TRUE);
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
UNICODE_STRING_SIMPLE("NFD"), FALSE);
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
UNICODE_STRING_SIMPLE("FCD"), FALSE);
}
/**
 * Factory method registered for every normalization ID.
 *
 * @param ID      the ID given to the new transliterator
 * @param context token whose pointer addresses the Normalizer2 data name;
 *                the byte right after the name's terminating NUL is the
 *                UNormalization2Mode (see registerIDs())
 * @return a newly allocated NormalizationTransliterator, or NULL when
 *         Normalizer2::getInstance() reports a failure
 */
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
                                                     Token context) {
    const char *dataName = static_cast<const char *>(context.pointer);
    // Searching for 0 yields the terminating NUL; the mode byte sits just behind it.
    const char *terminator = uprv_strchr(dataName, 0);
    UNormalization2Mode mode = static_cast<UNormalization2Mode>(terminator[1]);

    UErrorCode status = U_ZERO_ERROR;
    const Normalizer2 *normalizer = Normalizer2::getInstance(NULL, dataName, mode, status);
    if(U_FAILURE(status)) {
        return NULL;
    }
    return new NormalizationTransliterator(ID, *normalizer);
}
/**
 * Constructs a transliterator with the given ID that normalizes with norm2.
 *
 * @param id    the transliterator ID, forwarded to the Transliterator base
 * @param norm2 the normalizer used by handleTransliterate()
 */
NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
                                                         const Normalizer2 &norm2)
        : Transliterator(id, 0),  // NOTE(review): 0 is presumably "no filter" -- confirm in translit.h
          fNorm2(norm2) {
}
/**
 * Destructor. The body is empty: this class performs no cleanup of its own.
 */
NormalizationTransliterator::~NormalizationTransliterator() {}
/**
 * Copy constructor. Copies the Transliterator base from o and initializes
 * fNorm2 from o.fNorm2.
 */
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o)
        : Transliterator(o),
          fNorm2(o.fNorm2) {
}
/**
 * Transliterator API.
 *
 * @return a newly allocated copy of this object, made with the copy constructor
 */
Transliterator* NormalizationTransliterator::clone(void) const {
    Transliterator *copy = new NormalizationTransliterator(*this);
    return copy;
}
/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Normalizes the text between offsets.start and offsets.limit one segment at
 * a time and updates offsets.start, offsets.limit and offsets.contextLimit to
 * account for any change in length.
 *
 * @param text          the text to normalize in place
 * @param offsets       the range to process; updated on return
 * @param isIncremental if true, a final segment that reaches the limit is left
 *                      alone unless the normalizer reports a boundary after
 *                      its last character
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // Current position and end of the input range.
    int32_t pos = offsets.start;
    int32_t end = offsets.limit;
    if(pos >= end) {
        return;  // nothing to do
    }

    /*
     * Normalize in chunks that are as short as possible, even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:

    UnicodeString input, normalized;
    int32_t length = limit - start;
    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    fNorm2.normalize(input, normalized, status);

    text.handleReplaceBetween(start, limit, normalized);

    int32_t delta = normalized.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     */
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString chunk;
    UnicodeString result;
    UChar32 cp = text.char32At(pos);  // always the code point at pos when a chunk begins

    // pos < end holds on entry because of the guard above.
    while(pos < end) {
        const int32_t chunkStart = pos;

        // Collect one chunk: always take the first code point so that we make
        // progress, then keep going until the limit or until the normalizer
        // reports a boundary before the next code point.
        chunk.remove();
        for(;;) {
            chunk.append(cp);
            pos += U16_LENGTH(cp);
            if(pos >= end) {
                break;  // cp stays the last code point that was appended
            }
            cp = text.char32At(pos);
            if(fNorm2.hasBoundaryBefore(cp)) {
                break;  // cp is the first code point of the next chunk
            }
        }

        if(pos == end && isIncremental && !fNorm2.hasBoundaryAfter(cp)) {
            // Incremental mode and we ran into the input limit: characters
            // that arrive later could still change the normalization result,
            // so back up to the start of this chunk and stop.
            pos = chunkStart;
            break;
        }

        fNorm2.normalize(chunk, result, status);
        if(U_FAILURE(status)) {
            // Give up; pos has already moved past the chunk that failed.
            break;
        }

        if(chunk != result) {
            // Swap the input chunk for its normalized form ...
            text.handleReplaceBetween(chunkStart, pos, result);

            // ... and shift the indexes by the change in length.
            const int32_t growth = result.length() - (pos - chunkStart);
            pos += growth;
            end += growth;
        }
    }

    // Must be computed before offsets.limit is overwritten below.
    const int32_t limitShift = end - offsets.limit;
    offsets.start = pos;
    offsets.contextLimit += limitShift;
    offsets.limit = end;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */