474 lines
15 KiB
C++
474 lines
15 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
*******************************************************************************
|
|
* Copyright (C) 1996-2014, International Business Machines Corporation and
|
|
* others. All Rights Reserved.
|
|
*******************************************************************************
|
|
*/
|
|
|
|
/*
|
|
* File coleitr.cpp
|
|
*
|
|
* Created by: Helena Shih
|
|
*
|
|
* Modification History:
|
|
*
|
|
* Date Name Description
|
|
*
|
|
* 6/23/97 helena Adding comments to make code more readable.
|
|
* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
|
|
* 12/10/99 aliu Ported Thai collation support from Java.
|
|
* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
|
|
* 02/19/01 swquek Removed CollationElementIterator() since it is
|
|
* private constructor and no calls are made to it
|
|
* 2012-2014 markus Rewritten in C++ again.
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_COLLATION
|
|
|
|
#include "unicode/chariter.h"
|
|
#include "unicode/coleitr.h"
|
|
#include "unicode/tblcoll.h"
|
|
#include "unicode/ustring.h"
|
|
#include "cmemory.h"
|
|
#include "collation.h"
|
|
#include "collationdata.h"
|
|
#include "collationiterator.h"
|
|
#include "collationsets.h"
|
|
#include "collationtailoring.h"
|
|
#include "uassert.h"
|
|
#include "uhash.h"
|
|
#include "utf16collationiterator.h"
|
|
#include "uvectr32.h"
|
|
|
|
/* Constants --------------------------------------------------------------- */
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// Expands the UObject RTTI boilerplate for CollationElementIterator.
// NOTE(review): the macro is defined outside this file; presumably it supplies
// the class-ID accessors declared in the class header -- confirm there.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
|
|
|
|
/* CollationElementIterator public constructor/destructor ------------------ */
|
|
|
|
/**
 * Copy constructor.
 * Starts from an empty state (no iterator, no collator, no pending half,
 * no direction, no offsets) and then delegates the real work to the
 * assignment operator.
 * @param other the iterator to copy
 */
CollationElementIterator::CollationElementIterator(const CollationElementIterator& other)
        : UObject(other),
          iter_(NULL),
          rbc_(NULL),
          otherHalf_(0),
          dir_(0),
          offsets_(NULL)
{
    operator=(other);
}
|
|
|
|
/**
 * Destructor. Releases the two heap objects owned by this iterator:
 * the backward-iteration offsets vector and the underlying collation iterator.
 * rbc_ is not deleted here.
 */
CollationElementIterator::~CollationElementIterator()
{
    // The two objects are independent of each other; deleting NULL is a no-op.
    delete offsets_;
    delete iter_;
}
|
|
|
|
/* CollationElementIterator public methods --------------------------------- */
|
|
|
|
namespace {
|
|
|
|
/**
 * Builds the first old-style 32-bit CE from the two halves of a 64-bit CE.
 * @param p       upper 32 bits of the 64-bit CE
 * @param lower32 lower 32 bits of the 64-bit CE
 * @return bits 31..16 of p, bits 31..24 of lower32 moved to bits 15..8,
 *         and bits 15..8 of lower32 moved to bits 7..0
 */
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t upperPrimary = p & 0xffff0000;
    const uint32_t upperSecondary = (lower32 >> 16) & 0xff00;
    const uint32_t upperTertiary = (lower32 >> 8) & 0xff;
    return upperPrimary | upperSecondary | upperTertiary;
}
|
|
/**
 * Builds the second old-style 32-bit CE from the two halves of a 64-bit CE.
 * The result does not yet carry the 0xc0 continuation marker; callers OR it in.
 * @param p       upper 32 bits of the 64-bit CE
 * @param lower32 lower 32 bits of the 64-bit CE
 * @return bits 15..0 of p moved to bits 31..16, bits 23..16 of lower32 moved
 *         to bits 15..8, and bits 5..0 of lower32 kept in place
 */
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t lowerPrimary = p << 16;
    const uint32_t lowerSecondary = (lower32 >> 8) & 0xff00;
    const uint32_t lowerTertiary = lower32 & 0x3f;
    return lowerPrimary | lowerSecondary | lowerTertiary;
}
|
|
/**
 * Tells whether a 64-bit CE has any of the bits that end up in the second
 * old-style 32-bit CE (the same bits that getSecondHalf() extracts).
 * @param ce the 64-bit collation element
 * @return nonzero if any bit selected by the mask 0xffff00ff003f is set
 */
UBool ceNeedsTwoParts(int64_t ce) {
    const int64_t secondHalfBits = INT64_C(0xffff00ff003f);
    const int64_t present = ce & secondHalfBits;
    return present != 0;
}
|
|
|
|
} // namespace
|
|
|
|
int32_t CollationElementIterator::getOffset() const
|
|
{
|
|
if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
|
|
// CollationIterator::previousCE() decrements the CEs length
|
|
// while it pops CEs from its internal buffer.
|
|
int32_t i = iter_->getCEsLength();
|
|
if (otherHalf_ != 0) {
|
|
// Return the trailing CE offset while we are in the middle of a 64-bit CE.
|
|
++i;
|
|
}
|
|
U_ASSERT(i < offsets_->size());
|
|
return offsets_->elementAti(i);
|
|
}
|
|
return iter_->getOffset();
|
|
}
|
|
|
|
/**
|
|
* Get the ordering priority of the next character in the string.
|
|
* @return the next character's ordering. Returns NULLORDER if an error has
|
|
* occured or if the end of string has been reached
|
|
*/
|
|
int32_t CollationElementIterator::next(UErrorCode& status)
|
|
{
|
|
if (U_FAILURE(status)) { return NULLORDER; }
|
|
if (dir_ > 1) {
|
|
// Continue forward iteration. Test this first.
|
|
if (otherHalf_ != 0) {
|
|
uint32_t oh = otherHalf_;
|
|
otherHalf_ = 0;
|
|
return oh;
|
|
}
|
|
} else if (dir_ == 1) {
|
|
// next() after setOffset()
|
|
dir_ = 2;
|
|
} else if (dir_ == 0) {
|
|
// The iter_ is already reset to the start of the text.
|
|
dir_ = 2;
|
|
} else /* dir_ < 0 */ {
|
|
// illegal change of direction
|
|
status = U_INVALID_STATE_ERROR;
|
|
return NULLORDER;
|
|
}
|
|
// No need to keep all CEs in the buffer when we iterate.
|
|
iter_->clearCEsIfNoneRemaining();
|
|
int64_t ce = iter_->nextCE(status);
|
|
if (ce == Collation::NO_CE) { return NULLORDER; }
|
|
// Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
|
|
uint32_t p = (uint32_t)(ce >> 32);
|
|
uint32_t lower32 = (uint32_t)ce;
|
|
uint32_t firstHalf = getFirstHalf(p, lower32);
|
|
uint32_t secondHalf = getSecondHalf(p, lower32);
|
|
if (secondHalf != 0) {
|
|
otherHalf_ = secondHalf | 0xc0; // continuation CE
|
|
}
|
|
return firstHalf;
|
|
}
|
|
|
|
/**
 * Inequality test; the exact negation of operator==.
 * @param other the iterator to compare with
 * @return nonzero if the two iterators are not equal
 */
UBool CollationElementIterator::operator!=(const CollationElementIterator& other) const
{
    return !operator==(other);
}
|
|
|
|
/**
 * Equality test.
 * Two iterators are equal if they are the same object, or if they have equal
 * collators, the same pending half CE, the same normalized direction,
 * the same text, and equal underlying iterators.
 * @param that the iterator to compare with
 * @return TRUE if equal, FALSE otherwise
 */
UBool CollationElementIterator::operator==(const CollationElementIterator& that) const
{
    if (this == &that) {
        return TRUE;
    }
    // The checks run in order and stop at the first mismatch.
    // Collators: same object, or equal by value.
    if (rbc_ != that.rbc_ && !(*rbc_ == *that.rbc_)) {
        return FALSE;
    }
    if (!(otherHalf_ == that.otherHalf_)) {
        return FALSE;
    }
    if (!(normalizeDir() == that.normalizeDir())) {
        return FALSE;
    }
    if (!(string_ == that.string_)) {
        return FALSE;
    }
    return (*iter_ == *that.iter_) ? TRUE : FALSE;
}
|
|
|
|
/**
|
|
* Get the ordering priority of the previous collation element in the string.
|
|
* @param status the error code status.
|
|
* @return the previous element's ordering. Returns NULLORDER if an error has
|
|
* occured or if the start of string has been reached.
|
|
*/
|
|
int32_t CollationElementIterator::previous(UErrorCode& status)
|
|
{
|
|
if (U_FAILURE(status)) { return NULLORDER; }
|
|
if (dir_ < 0) {
|
|
// Continue backwards iteration. Test this first.
|
|
if (otherHalf_ != 0) {
|
|
uint32_t oh = otherHalf_;
|
|
otherHalf_ = 0;
|
|
return oh;
|
|
}
|
|
} else if (dir_ == 0) {
|
|
iter_->resetToOffset(string_.length());
|
|
dir_ = -1;
|
|
} else if (dir_ == 1) {
|
|
// previous() after setOffset()
|
|
dir_ = -1;
|
|
} else /* dir_ > 1 */ {
|
|
// illegal change of direction
|
|
status = U_INVALID_STATE_ERROR;
|
|
return NULLORDER;
|
|
}
|
|
if (offsets_ == NULL) {
|
|
offsets_ = new UVector32(status);
|
|
if (offsets_ == NULL) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
return NULLORDER;
|
|
}
|
|
}
|
|
// If we already have expansion CEs, then we also have offsets.
|
|
// Otherwise remember the trailing offset in case we need to
|
|
// write offsets for an artificial expansion.
|
|
int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
|
|
int64_t ce = iter_->previousCE(*offsets_, status);
|
|
if (ce == Collation::NO_CE) { return NULLORDER; }
|
|
// Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
|
|
uint32_t p = (uint32_t)(ce >> 32);
|
|
uint32_t lower32 = (uint32_t)ce;
|
|
uint32_t firstHalf = getFirstHalf(p, lower32);
|
|
uint32_t secondHalf = getSecondHalf(p, lower32);
|
|
if (secondHalf != 0) {
|
|
if (offsets_->isEmpty()) {
|
|
// When we convert a single 64-bit CE into two 32-bit CEs,
|
|
// we need to make this artificial expansion behave like a normal expansion.
|
|
// See CollationIterator::previousCE().
|
|
offsets_->addElement(iter_->getOffset(), status);
|
|
offsets_->addElement(limitOffset, status);
|
|
}
|
|
otherHalf_ = firstHalf;
|
|
return secondHalf | 0xc0; // continuation CE
|
|
}
|
|
return firstHalf;
|
|
}
|
|
|
|
/**
|
|
* Resets the cursor to the beginning of the string.
|
|
*/
|
|
void CollationElementIterator::reset()
|
|
{
|
|
iter_ ->resetToOffset(0);
|
|
otherHalf_ = 0;
|
|
dir_ = 0;
|
|
}
|
|
|
|
void CollationElementIterator::setOffset(int32_t newOffset,
|
|
UErrorCode& status)
|
|
{
|
|
if (U_FAILURE(status)) { return; }
|
|
if (0 < newOffset && newOffset < string_.length()) {
|
|
int32_t offset = newOffset;
|
|
do {
|
|
UChar c = string_.charAt(offset);
|
|
if (!rbc_->isUnsafe(c) ||
|
|
(U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
|
|
break;
|
|
}
|
|
// Back up to before this unsafe character.
|
|
--offset;
|
|
} while (offset > 0);
|
|
if (offset < newOffset) {
|
|
// We might have backed up more than necessary.
|
|
// For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
|
|
// but for text "chu" setOffset(2) should remain at 2
|
|
// although we initially back up to offset 0.
|
|
// Find the last safe offset no greater than newOffset by iterating forward.
|
|
int32_t lastSafeOffset = offset;
|
|
do {
|
|
iter_->resetToOffset(lastSafeOffset);
|
|
do {
|
|
iter_->nextCE(status);
|
|
if (U_FAILURE(status)) { return; }
|
|
} while ((offset = iter_->getOffset()) == lastSafeOffset);
|
|
if (offset <= newOffset) {
|
|
lastSafeOffset = offset;
|
|
}
|
|
} while (offset < newOffset);
|
|
newOffset = lastSafeOffset;
|
|
}
|
|
}
|
|
iter_->resetToOffset(newOffset);
|
|
otherHalf_ = 0;
|
|
dir_ = 1;
|
|
}
|
|
|
|
/**
|
|
* Sets the source to the new source string.
|
|
*/
|
|
void CollationElementIterator::setText(const UnicodeString& source,
|
|
UErrorCode& status)
|
|
{
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
|
|
string_ = source;
|
|
const UChar *s = string_.getBuffer();
|
|
CollationIterator *newIter;
|
|
UBool numeric = rbc_->settings->isNumeric();
|
|
if (rbc_->settings->dontCheckFCD()) {
|
|
newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
|
|
} else {
|
|
newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
|
|
}
|
|
if (newIter == NULL) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
return;
|
|
}
|
|
delete iter_;
|
|
iter_ = newIter;
|
|
otherHalf_ = 0;
|
|
dir_ = 0;
|
|
}
|
|
|
|
// Sets the source to the new character iterator.
|
|
void CollationElementIterator::setText(CharacterIterator& source,
|
|
UErrorCode& status)
|
|
{
|
|
if (U_FAILURE(status))
|
|
return;
|
|
|
|
source.getText(string_);
|
|
setText(string_, status);
|
|
}
|
|
|
|
int32_t CollationElementIterator::strengthOrder(int32_t order) const
|
|
{
|
|
UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
|
|
// Mask off the unwanted differences.
|
|
if (s == UCOL_PRIMARY) {
|
|
order &= 0xffff0000;
|
|
}
|
|
else if (s == UCOL_SECONDARY) {
|
|
order &= 0xffffff00;
|
|
}
|
|
|
|
return order;
|
|
}
|
|
|
|
/* CollationElementIterator private constructors/destructors --------------- */
|
|
|
|
/**
|
|
* This is the "real" constructor for this class; it constructs an iterator
|
|
* over the source text using the specified collator
|
|
*/
|
|
CollationElementIterator::CollationElementIterator(
|
|
const UnicodeString &source,
|
|
const RuleBasedCollator *coll,
|
|
UErrorCode &status)
|
|
: iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
|
|
setText(source, status);
|
|
}
|
|
|
|
/**
|
|
* This is the "real" constructor for this class; it constructs an iterator over
|
|
* the source text using the specified collator
|
|
*/
|
|
CollationElementIterator::CollationElementIterator(
|
|
const CharacterIterator &source,
|
|
const RuleBasedCollator *coll,
|
|
UErrorCode &status)
|
|
: iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
|
|
// We only call source.getText() which should be const anyway.
|
|
setText(const_cast<CharacterIterator &>(source), status);
|
|
}
|
|
|
|
/* CollationElementIterator private methods -------------------------------- */
|
|
|
|
/**
 * Assignment operator: makes this iterator a copy of other.
 *
 * The text, the collator pointer, the pending half CE and the direction are
 * copied, and the underlying iterator is cloned so that it refers to this
 * object's own copy of the text. If other is iterating backwards and has
 * recorded offsets, those are copied as well.
 *
 * If other's underlying iterator is neither an FCDUTF16CollationIterator nor
 * a UTF16CollationIterator (for example NULL), only the offsets are copied.
 *
 * @param other the iterator to copy
 * @return a reference to this object
 */
const CollationElementIterator& CollationElementIterator::operator=(
        const CollationElementIterator& other)
{
    if (this == &other) {
        return *this;
    }

    // Find out which concrete iterator class needs to be cloned.
    // The FCD class is tested first, as in the original code.
    const FCDUTF16CollationIterator *otherFCDIter =
            dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
    const UTF16CollationIterator *otherIter = NULL;
    if (otherFCDIter == NULL) {
        otherIter = dynamic_cast<const UTF16CollationIterator *>(other.iter_);
    }

    if (otherFCDIter != NULL || otherIter != NULL) {
        // Adopt the text BEFORE cloning the iterator. The clone stores
        // pointers into the buffer it is given; taking string_.getBuffer()
        // first and assigning string_ afterwards (as this code used to do)
        // leaves the clone pointing at the old buffer, which the assignment
        // may release or overwrite.
        string_ = other.string_;
        const UChar *text = string_.getBuffer();

        CollationIterator *newIter;
        if (otherFCDIter != NULL) {
            newIter = new FCDUTF16CollationIterator(*otherFCDIter, text);
        } else {
            newIter = new UTF16CollationIterator(*otherIter, text);
        }

        // The previous iterator referred to the text that was just replaced,
        // so it must go even if the allocation above failed.
        // NOTE(review): on allocation failure iter_ is left NULL, the same
        // state the copy constructor ends up in when cloning fails; there is
        // no status parameter to report it through.
        delete iter_;
        iter_ = newIter;
        rbc_ = other.rbc_;
        otherHalf_ = other.otherHalf_;
        dir_ = other.dir_;
    }

    // Offsets are only meaningful during backward iteration.
    if (other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
        UErrorCode errorCode = U_ZERO_ERROR;
        if (offsets_ == NULL) {
            offsets_ = new UVector32(other.offsets_->size(), errorCode);
        }
        if (offsets_ != NULL) {
            offsets_->assign(*other.offsets_, errorCode);
        }
    }
    return *this;
}
|
|
|
|
namespace {
|
|
|
|
class MaxExpSink : public ContractionsAndExpansions::CESink {
|
|
public:
|
|
MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
|
|
virtual ~MaxExpSink();
|
|
virtual void handleCE(int64_t /*ce*/) {}
|
|
virtual void handleExpansion(const int64_t ces[], int32_t length) {
|
|
if (length <= 1) {
|
|
// We do not need to add single CEs into the map.
|
|
return;
|
|
}
|
|
int32_t count = 0; // number of CE "halves"
|
|
for (int32_t i = 0; i < length; ++i) {
|
|
count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
|
|
}
|
|
// last "half" of the last CE
|
|
int64_t ce = ces[length - 1];
|
|
uint32_t p = (uint32_t)(ce >> 32);
|
|
uint32_t lower32 = (uint32_t)ce;
|
|
uint32_t lastHalf = getSecondHalf(p, lower32);
|
|
if (lastHalf == 0) {
|
|
lastHalf = getFirstHalf(p, lower32);
|
|
U_ASSERT(lastHalf != 0);
|
|
} else {
|
|
lastHalf |= 0xc0; // old-style continuation CE
|
|
}
|
|
if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
|
|
uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
|
|
}
|
|
}
|
|
|
|
private:
|
|
UHashtable *maxExpansions;
|
|
UErrorCode &errorCode;
|
|
};
|
|
|
|
MaxExpSink::~MaxExpSink() {}
|
|
|
|
} // namespace
|
|
|
|
/**
 * Builds the table of maximum expansion sizes for the given collation data.
 * A new hash table is opened and filled by running a MaxExpSink over the
 * expansions found in the data.
 * @param data      the collation data to scan
 * @param errorCode the error code status; NULL is returned immediately if it
 *                  already indicates a failure
 * @return the new hash table, or NULL on failure (the table is closed again
 *         if it had already been opened)
 */
UHashtable *
CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return NULL;
    }
    UHashtable *table = uhash_open(uhash_hashLong, uhash_compareLong,
                                   uhash_compareLong, &errorCode);
    if (U_FAILURE(errorCode)) {
        return NULL;
    }

    MaxExpSink sink(table, errorCode);
    ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
    if (U_FAILURE(errorCode)) {
        // Do not hand out a partially filled table.
        uhash_close(table);
        table = NULL;
    }
    return table;
}
|
|
|
|
int32_t
|
|
CollationElementIterator::getMaxExpansion(int32_t order) const {
|
|
return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
|
|
}
|
|
|
|
int32_t
|
|
CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
|
|
if (order == 0) { return 1; }
|
|
int32_t max;
|
|
if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
|
|
return max;
|
|
}
|
|
if ((order & 0xc0) == 0xc0) {
|
|
// old-style continuation CE
|
|
return 2;
|
|
} else {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif /* #if !UCONFIG_NO_COLLATION */
|