icu/source/i18n/ucol.cpp

621 lines
19 KiB
C++

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucol.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* Modification history
* Date Name Comments
* 1996-1999 various members of ICU team maintained C API for collation framework
* 02/16/2001 synwee Added internal method getPrevSpecialCE
* 03/01/2001 synwee Added maxexpansion functionality.
* 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
* 2012-2014 markus Rewritten in C++ again.
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/coll.h"
#include "unicode/tblcoll.h"
#include "unicode/bytestream.h"
#include "unicode/coleitr.h"
#include "unicode/ucoleitr.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "collation.h"
#include "cstring.h"
#include "putilimp.h"
#include "uassert.h"
#include "utracimp.h"
U_NAMESPACE_USE
/*
 * Creates a RuleBasedCollator from a binary image and returns it as a UCollator.
 *
 * bin/length: the binary image and its length, passed to the RuleBasedCollator constructor.
 * base:       collator handed to the constructor as the base (converted via rbcFromUCollator).
 * status:     in/out error code; nothing is done if it already indicates a failure.
 *
 * Returns NULL when status indicates a failure on entry, when allocation fails
 * (status becomes U_MEMORY_ALLOCATION_ERROR), or when the constructor reports a failure.
 */
U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status)
{
    if(U_FAILURE(*status)) {
        return NULL;
    }
    const RuleBasedCollator *baseCollator = RuleBasedCollator::rbcFromUCollator(base);
    RuleBasedCollator *created = new RuleBasedCollator(bin, length, baseCollator, *status);
    if(created == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if(!U_FAILURE(*status)) {
        return created->toUCollator();
    }
    /* construction failed: discard the half-built object */
    delete created;
    return NULL;
}
/*
 * Writes the binary form of a collator into buffer by delegating to
 * RuleBasedCollator::cloneBinary().
 *
 * Returns 0 if status already indicates a failure, or if coll is non-NULL but
 * is not a RuleBasedCollator (status becomes U_UNSUPPORTED_ERROR).
 * A NULL coll is not checked and is dereferenced.
 */
U_CAPI int32_t U_EXPORT2
ucol_cloneBinary(const UCollator *coll,
                 uint8_t *buffer, int32_t capacity,
                 UErrorCode *status)
{
    if(U_FAILURE(*status)) { return 0; }

    const RuleBasedCollator *ruleBased = RuleBasedCollator::rbcFromUCollator(coll);
    const bool notRuleBased = (ruleBased == NULL && coll != NULL);
    if(notRuleBased) {
        *status = U_UNSUPPORTED_ERROR;
        return 0;
    }
    int32_t binaryLength = ruleBased->cloneBinary(buffer, capacity, *status);
    return binaryLength;
}
/*
 * Clones a collator.
 *
 * coll:        collator to clone; NULL yields U_ILLEGAL_ARGUMENT_ERROR.
 * stackBuffer: unused.
 * pBufferSize: optional; when non-NULL it is always set to 1, and if its input
 *              value was 0 the call is treated as a preflight and NULL is returned.
 * status:      in/out error code; NULL or an existing failure makes this a no-op.
 *
 * Returns the clone, with status set to U_SAFECLONE_ALLOCATED_WARNING, or NULL
 * on error (U_MEMORY_ALLOCATION_ERROR when clone() returns NULL).
 */
U_CAPI UCollator* U_EXPORT2
ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)
{
    if (status == NULL || U_FAILURE(*status)){
        return NULL;
    }
    if (coll == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    if (pBufferSize != NULL) {
        int32_t inputSize = *pBufferSize;
        *pBufferSize = 1;
        if (inputSize == 0) {
            return NULL;  // preflighting for deprecated functionality
        }
    }
    Collator *newColl = Collator::fromUCollator(coll)->clone();
    if (newColl == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        // Return right away: calling a member function on a null pointer is undefined behavior.
        return NULL;
    }
    *status = U_SAFECLONE_ALLOCATED_WARNING;
    return newColl->toUCollator();
}
/*
 * Deletes the Collator behind a UCollator handle; a NULL handle is ignored.
 * Entry and exit are reported through the UTRACE macros.
 */
U_CAPI void U_EXPORT2
ucol_close(UCollator *coll)
{
    UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    if(coll != NULL) {
        Collator *doomed = Collator::fromUCollator(coll);
        delete doomed;
    }
    UTRACE_EXIT();
}
/*
 * Merges two sort keys level by level into dest.
 *
 * For each level the bytes of src1 are written, then a 02 merge separator,
 * then the bytes of src2. While both keys have a further level, a 01 level
 * separator is written and merging continues. Whatever remains of the longer
 * key is appended, including its terminating zero byte.
 *
 * A length of -1 means the key is zero-terminated. A positive length must
 * include a terminating zero byte as its last byte.
 *
 * Returns 0 for invalid arguments (a zero byte is written to dest if possible),
 * src1Length+src2Length without writing anything if that exceeds destCapacity,
 * otherwise the number of bytes written.
 */
U_CAPI int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
                   const uint8_t *src2, int32_t src2Length,
                   uint8_t *dest, int32_t destCapacity) {
    /* argument validation, evaluated in the same left-to-right short-circuit order */
    const bool argumentsOk =
        src1!=NULL && src1Length>=-1 && src1Length!=0 && (src1Length<=0 || src1[src1Length-1]==0) &&
        src2!=NULL && src2Length>=-1 && src2Length!=0 && (src2Length<=0 || src2[src2Length-1]==0) &&
        destCapacity>=0 && (destCapacity<=0 || dest!=NULL);
    if(!argumentsOk) {
        /* error: try to leave an empty key behind and report 0 */
        if(dest!=NULL && destCapacity>0) {
            *dest=0;
        }
        return 0;
    }

    /* resolve zero-terminated inputs to explicit lengths (terminator included) */
    if(src1Length<0) {
        src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
    }
    if(src2Length<0) {
        src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
    }

    const int32_t mergedLength=src1Length+src2Length;
    if(mergedLength>destCapacity) {
        /* does not fit: report the capacity that would be sufficient */
        return mergedLength;
    }

    /* interleave the levels that both keys have */
    uint8_t *out=dest;
    for(;;) {
        uint8_t b;
        /* level bytes from src1, stopping at 00 or 01 */
        for(b=*src1; b>=2; b=*++src1) {
            *out++=b;
        }
        /* 02 merge separator */
        *out++=2;
        /* level bytes from src2, stopping at 00 or 01 */
        for(b=*src2; b>=2; b=*++src2) {
            *out++=b;
        }
        /* stop unless both keys continue with another level */
        if(*src1!=1 || *src2!=1) {
            break;
        }
        ++src1;
        ++src2;
        *out++=1;
    }

    /*
     * At least one key is exhausted. If src1 still has content then src2 is
     * the exhausted one; otherwise the remainder (possibly just the
     * terminator) comes from src2.
     */
    const uint8_t *rest=src2;
    if(*src1!=0) {
        rest=src1;
    }
    /* append the remainder including its terminating zero byte */
    uint8_t copied;
    do {
        copied=*rest++;
        *out++=copied;
    } while(copied!=0);

    /* may be shorter than mergedLength if a key contained illegally embedded zero bytes */
    return (int32_t)(out-dest);
}
/*
 * Computes the sort key for a string by forwarding to Collator::getSortKey(),
 * tracing the input string and the resulting key.
 *
 * Returns the value returned by Collator::getSortKey().
 */
U_CAPI int32_t U_EXPORT2
ucol_getSortKey(const UCollator *coll,
                const UChar *source,
                int32_t sourceLength,
                uint8_t *result,
                int32_t resultLength)
{
    UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
        /* a length of -1 is resolved with u_strlen() so the trace shows the whole string */
        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
            ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
    }

    const Collator *collator = Collator::fromUCollator(coll);
    int32_t sortKeyLength = collator->getSortKey(source, sourceLength, result, resultLength);

    UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, sortKeyLength);
    UTRACE_EXIT_VALUE(sortKeyLength);
    return sortKeyLength;
}
/*
 * Produces the next part of a sort key by forwarding to
 * Collator::internalNextSortKeyPart(), tracing arguments and results.
 *
 * Returns 0 without tracing if status is NULL or already indicates a failure;
 * otherwise returns the value reported by the collator.
 */
U_CAPI int32_t U_EXPORT2
ucol_nextSortKeyPart(const UCollator *coll,
                     UCharIterator *iter,
                     uint32_t state[2],
                     uint8_t *dest, int32_t count,
                     UErrorCode *status)
{
    /* bail out before any tracing when there is no usable status */
    if(status==NULL || U_FAILURE(*status)) {
        return 0;
    }
    UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
    UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
                 coll, iter, state[0], state[1], dest, count);

    const Collator *collator = Collator::fromUCollator(coll);
    int32_t bytesWritten = collator->internalNextSortKeyPart(iter, state, dest, count, *status);

    // bytesWritten is the number of meaningful sortkey bytes.
    UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
                 dest,bytesWritten, state[0], state[1]);
    UTRACE_EXIT_VALUE_STATUS(bytesWritten, *status);
    return bytesWritten;
}
/**
 * Produce a bound for a given sortkey and a number of levels.
 *
 * The key is scanned until noOfLevels level separators have been seen or the
 * end of the key is reached. The bytes before that position are copied to
 * result, followed by the extra bytes for the requested bound type and a
 * terminating zero byte.
 *
 * @param source       sort key to derive the bound from; must not be NULL
 * @param sourceLength length of source; used together with the zero byte to
 *                     detect the end of the key
 * @param boundType    kind of bound; its enum value equals the number of
 *                     extra bytes appended before the terminator
 * @param noOfLevels   number of levels to keep
 * @param result       destination buffer; may be NULL to preflight
 * @param resultLength capacity of result in bytes
 * @param status       in/out error code; set to U_SORT_KEY_TOO_SHORT_WARNING
 *                     when the key has fewer levels than requested, and to
 *                     U_ILLEGAL_ARGUMENT_ERROR for a NULL source or an
 *                     unknown boundType
 * @return the number of bytes written including the terminating zero byte,
 *         or the required capacity when result is NULL or too small;
 *         0 on error
 */
U_CAPI int32_t U_EXPORT2
ucol_getBound(const uint8_t *source,
              int32_t sourceLength,
              UColBoundMode boundType,
              uint32_t noOfLevels,
              uint8_t *result,
              int32_t resultLength,
              UErrorCode *status)
{
    // consistency checks
    if(status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(source == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t sourceIndex = 0;
    // Scan the string until we skip enough of the key OR reach the end of the key
    do {
        sourceIndex++;
        if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) {
            noOfLevels--;
        }
    } while (noOfLevels > 0
        && (source[sourceIndex] != 0 || sourceIndex < sourceLength));

    if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
        && noOfLevels > 0) {
        *status = U_SORT_KEY_TOO_SHORT_WARNING;
    }

    // READ ME: this code assumes that the values for boundType
    // enum will not change. They are set so that the enum value
    // corresponds to the number of extra bytes each bound type
    // needs.
    // The complete bound is the copied prefix, the extra bytes, and the
    // terminating zero byte; the buffer must have room for all of them.
    const int32_t boundLength = sourceIndex+boundType+1;
    if(result != NULL && resultLength >= boundLength) {
        uprv_memcpy(result, source, sourceIndex);
        switch(boundType) {
            // Lower bound just gets terminated. No extra bytes
        case UCOL_BOUND_LOWER: // = 0
            break;
            // Upper bound needs one extra byte
        case UCOL_BOUND_UPPER: // = 1
            result[sourceIndex++] = 2;
            break;
            // Upper long bound needs two extra bytes
        case UCOL_BOUND_UPPER_LONG: // = 2
            result[sourceIndex++] = 0xFF;
            result[sourceIndex++] = 0xFF;
            break;
        default:
            *status = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        result[sourceIndex++] = 0;

        return sourceIndex;
    } else {
        // Preflight or overflow: report the capacity that is needed.
        return boundLength;
    }
}
/*
 * Forwards to Collator::setMaxVariable() unless *pErrorCode already
 * indicates a failure.
 */
U_CAPI void U_EXPORT2
ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return;
    }
    Collator *collator = Collator::fromUCollator(coll);
    collator->setMaxVariable(group, *pErrorCode);
}
/*
 * Returns the result of Collator::getMaxVariable() for the given collator.
 */
U_CAPI UColReorderCode U_EXPORT2
ucol_getMaxVariable(const UCollator *coll) {
    const Collator *collator = Collator::fromUCollator(coll);
    return collator->getMaxVariable();
}
/*
 * Forwards to Collator::setVariableTop(varTop, len, status) and returns its result.
 * Returns 0 if *status already indicates a failure or coll is NULL.
 */
U_CAPI uint32_t U_EXPORT2
ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return 0;
    }
    if(coll == NULL) {
        return 0;
    }
    Collator *collator = Collator::fromUCollator(coll);
    return collator->setVariableTop(varTop, len, *status);
}
/*
 * Returns the result of Collator::getVariableTop(), or 0 if *status already
 * indicates a failure or coll is NULL.
 */
U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return 0;
    }
    if(coll == NULL) {
        return 0;
    }
    const Collator *collator = Collator::fromUCollator(coll);
    return collator->getVariableTop(*status);
}
/*
 * Forwards a numeric variable-top value to Collator::setVariableTop(varTop, status).
 * Does nothing if *status already indicates a failure or coll is NULL.
 */
U_CAPI void U_EXPORT2
ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }
    if(coll == NULL) {
        return;
    }
    Collator *collator = Collator::fromUCollator(coll);
    collator->setVariableTop(varTop, *status);
}
/*
 * Forwards to Collator::setAttribute(attr, value, status).
 * Does nothing if *status already indicates a failure or coll is NULL.
 */
U_CAPI void U_EXPORT2
ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return;
    }
    if(coll == NULL) {
        return;
    }
    Collator *collator = Collator::fromUCollator(coll);
    collator->setAttribute(attr, value, *status);
}
/*
 * Returns the result of Collator::getAttribute(attr, status), or UCOL_DEFAULT
 * if *status already indicates a failure or coll is NULL.
 */
U_CAPI UColAttributeValue U_EXPORT2
ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return UCOL_DEFAULT;
    }
    if(coll == NULL) {
        return UCOL_DEFAULT;
    }
    const Collator *collator = Collator::fromUCollator(coll);
    return collator->getAttribute(attr, *status);
}
/*
 * Sets the UCOL_STRENGTH attribute through ucol_setAttribute().
 * The error code of that call is local to this function and is discarded.
 */
U_CAPI void U_EXPORT2
ucol_setStrength(UCollator *coll,
                 UCollationStrength strength)
{
    UErrorCode discardedStatus = U_ZERO_ERROR;
    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &discardedStatus);
}
/*
 * Reads the UCOL_STRENGTH attribute through ucol_getAttribute().
 * The error code of that call is local to this function and is discarded.
 */
U_CAPI UCollationStrength U_EXPORT2
ucol_getStrength(const UCollator *coll)
{
    UErrorCode discardedStatus = U_ZERO_ERROR;
    UCollationStrength strength = ucol_getAttribute(coll, UCOL_STRENGTH, &discardedStatus);
    return strength;
}
/*
 * Returns the result of Collator::getReorderCodes(dest, destCapacity, status),
 * or 0 if *status already indicates a failure.
 */
U_CAPI int32_t U_EXPORT2
ucol_getReorderCodes(const UCollator *coll,
                     int32_t *dest,
                     int32_t destCapacity,
                     UErrorCode *status) {
    if (U_FAILURE(*status)) { return 0; }

    const Collator *collator = Collator::fromUCollator(coll);
    int32_t codeCount = collator->getReorderCodes(dest, destCapacity, *status);
    return codeCount;
}
/*
 * Forwards to Collator::setReorderCodes(reorderCodes, reorderCodesLength, status)
 * unless *status already indicates a failure.
 */
U_CAPI void U_EXPORT2
ucol_setReorderCodes(UCollator* coll,
                     const int32_t* reorderCodes,
                     int32_t reorderCodesLength,
                     UErrorCode *status) {
    if (U_FAILURE(*status)) { return; }

    Collator *collator = Collator::fromUCollator(coll);
    collator->setReorderCodes(reorderCodes, reorderCodesLength, *status);
}
/*
 * Returns the result of the static Collator::getEquivalentReorderCodes().
 * *pErrorCode is passed through without being inspected here.
 */
U_CAPI int32_t U_EXPORT2
ucol_getEquivalentReorderCodes(int32_t reorderCode,
                               int32_t* dest,
                               int32_t destCapacity,
                               UErrorCode *pErrorCode) {
    int32_t codeCount =
        Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity, *pErrorCode);
    return codeCount;
}
/*
 * Fills versionInfo by calling Collator::getVersion() on the given collator.
 */
U_CAPI void U_EXPORT2
ucol_getVersion(const UCollator* coll,
                UVersionInfo versionInfo)
{
    const Collator *collator = Collator::fromUCollator(coll);
    collator->getVersion(versionInfo);
}
/*
 * Compares the text of two UCharIterators with Collator::compare().
 *
 * Returns UCOL_EQUAL without tracing if status is NULL or already indicates a
 * failure. Sets U_ILLEGAL_ARGUMENT_ERROR and returns UCOL_EQUAL if any of
 * coll, sIter or tIter is NULL.
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcollIter(const UCollator *coll,
                 UCharIterator *sIter,
                 UCharIterator *tIter,
                 UErrorCode *status)
{
    if(!status || U_FAILURE(*status)) {
        return UCOL_EQUAL;
    }

    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);

    /* UCOL_EQUAL is what gets reported when an argument is missing */
    UCollationResult result = UCOL_EQUAL;
    if(sIter != NULL && tIter != NULL && coll != NULL) {
        result = Collator::fromUCollator(coll)->compare(*sIter, *tIter, *status);
    } else {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
    }

    UTRACE_EXIT_VALUE_STATUS(result, *status);
    return result;
}
/*
 * ucol_strcoll: main public API string comparison function.
 *
 * Compares two UTF-16 strings with Collator::compare(). The error code of
 * that call is local to this function; it is traced but not returned.
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcoll(const UCollator *coll,
             const UChar *source,
             int32_t sourceLength,
             const UChar *target,
             int32_t targetLength)
{
    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
    }

    UErrorCode localStatus = U_ZERO_ERROR;
    const Collator *collator = Collator::fromUCollator(coll);
    UCollationResult order =
        collator->compare(source, sourceLength, target, targetLength, localStatus);

    UTRACE_EXIT_VALUE_STATUS(order, localStatus);
    return order;
}
/*
 * Compares two strings given as char buffers via Collator::internalCompareUTF8().
 *
 * If *status already indicates a failure, no comparison is made and
 * UCOL_EQUAL is returned. Entry, arguments and exit are traced either way.
 */
U_CAPI UCollationResult U_EXPORT2
ucol_strcollUTF8(
        const UCollator *coll,
        const char *source,
        int32_t sourceLength,
        const char *target,
        int32_t targetLength,
        UErrorCode *status)
{
    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
    }

    /* stays UCOL_EQUAL when the incoming status is already a failure */
    UCollationResult order = UCOL_EQUAL;
    if (!U_FAILURE(*status)) {
        order = Collator::fromUCollator(coll)->internalCompareUTF8(
                source, sourceLength, target, targetLength, *status);
    }

    UTRACE_EXIT_VALUE_STATUS(order, *status);
    return order;
}
/* Convenience wrapper: true when ucol_strcoll() reports UCOL_GREATER for source vs. target. */
U_CAPI UBool U_EXPORT2
ucol_greater(const UCollator *coll,
             const UChar *source,
             int32_t sourceLength,
             const UChar *target,
             int32_t targetLength)
{
    const UCollationResult order =
        ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order == UCOL_GREATER;
}
/* Convenience wrapper: true when ucol_strcoll() reports anything other than UCOL_LESS. */
U_CAPI UBool U_EXPORT2
ucol_greaterOrEqual(const UCollator *coll,
                    const UChar *source,
                    int32_t sourceLength,
                    const UChar *target,
                    int32_t targetLength)
{
    const UCollationResult order =
        ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order != UCOL_LESS;
}
/* Convenience wrapper: true when ucol_strcoll() reports UCOL_EQUAL for source vs. target. */
U_CAPI UBool U_EXPORT2
ucol_equal(const UCollator *coll,
           const UChar *source,
           int32_t sourceLength,
           const UChar *target,
           int32_t targetLength)
{
    const UCollationResult order =
        ucol_strcoll(coll, source, sourceLength, target, targetLength);
    return order == UCOL_EQUAL;
}
/*
 * Derives the UCA version from the collator's overall version and stores it in info.
 * info is left untouched when the collator pointer is NULL.
 */
U_CAPI void U_EXPORT2
ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
    const Collator *collator = Collator::fromUCollator(coll);
    if(collator == NULL) {
        return;
    }
    UVersionInfo collatorVersion;
    collator->getVersion(collatorVersion);
    // Note: This decoding is tied to how the current implementation packs the
    // UCA version into the overall getVersion() value.
    // An alternative would be to load the root collator and read lower-level
    // data from there. Either way, the result reflects the input collator's
    // UCA version only if it is a known implementation.
    // A virtual Collator method would be a cleaner solution.
    info[0] = collatorVersion[1] >> 3;
    info[1] = collatorVersion[1] & 7;
    info[2] = collatorVersion[2] >> 6;
    info[3] = 0;
}
/*
 * Returns the rule string of a RuleBasedCollator and stores its length in *length.
 * For a non-NULL collator that is not a RuleBasedCollator, returns a pointer to
 * an empty, NUL-terminated string and sets *length to 0.
 */
U_CAPI const UChar * U_EXPORT2
ucol_getRules(const UCollator *coll, int32_t *length) {
    const RuleBasedCollator *ruleBased = RuleBasedCollator::rbcFromUCollator(coll);
    if(ruleBased == NULL && coll != NULL) {
        /* not rule-based: hand out an empty string */
        static const UChar emptyRules = 0;
        *length = 0;
        return &emptyRules;
    }
    // OK to crash if coll==NULL: We do not want to check "this" pointers.
    const UnicodeString &ruleString = ruleBased->getRules();
    U_ASSERT(ruleString.getBuffer()[ruleString.length()] == 0);
    *length = ruleString.length();
    return ruleString.getBuffer();
}
/*
 * Fetches the rules selected by delta via RuleBasedCollator::getRules() and
 * extracts them into buffer.
 *
 * The rules stay empty for a non-NULL collator that is not a RuleBasedCollator.
 * Returns the rule length when buffer is NULL or bufferLen is not positive;
 * otherwise returns the result of UnicodeString::extract(). The error code of
 * that call is local to this function and is discarded.
 */
U_CAPI int32_t U_EXPORT2
ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
    UnicodeString ruleString;
    const RuleBasedCollator *ruleBased = RuleBasedCollator::rbcFromUCollator(coll);
    if(ruleBased != NULL || coll == NULL) {
        ruleBased->getRules(delta, ruleString);
    }
    if(buffer == NULL || bufferLen <= 0) {
        /* no usable buffer: only report the length */
        return ruleString.length();
    }
    UErrorCode discardedStatus = U_ZERO_ERROR;
    return ruleString.extract(buffer, bufferLen, discardedStatus);
}
/*
 * Returns whatever ucol_getLocaleByType() returns for the same arguments.
 */
U_CAPI const char * U_EXPORT2
ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
    const char *localeID = ucol_getLocaleByType(coll, type, status);
    return localeID;
}
/*
 * Returns the locale ID of the requested type from
 * RuleBasedCollator::internalGetLocaleID().
 *
 * Returns NULL without tracing if *status already indicates a failure.
 * For a non-NULL collator that is not a RuleBasedCollator, sets
 * U_UNSUPPORTED_ERROR and returns NULL.
 */
U_CAPI const char * U_EXPORT2
ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return NULL;
    }
    UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);
    UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);

    const char *localeID = NULL;
    const RuleBasedCollator *ruleBased = RuleBasedCollator::rbcFromUCollator(coll);
    if(ruleBased != NULL || coll == NULL) {
        localeID = ruleBased->internalGetLocaleID(type, *status);
    } else {
        *status = U_UNSUPPORTED_ERROR;
    }

    UTRACE_DATA1(UTRACE_INFO, "result = %s", localeID);
    UTRACE_EXIT_STATUS(*status);
    return localeID;
}
/*
 * Returns the set produced by Collator::getTailoredSet() as a USet.
 * Returns NULL if *status indicates a failure on entry or after the call;
 * in the latter case the set returned by the collator is deleted.
 */
U_CAPI USet * U_EXPORT2
ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) {
    if(U_FAILURE(*status)) { return NULL; }

    UnicodeSet *tailored = Collator::fromUCollator(coll)->getTailoredSet(*status);
    if(!U_FAILURE(*status)) {
        return tailored->toUSet();
    }
    delete tailored;
    return NULL;
}
/*
 * Two handles are equal if they are the same pointer or if the underlying
 * Collator objects compare equal with operator==.
 */
U_CAPI UBool U_EXPORT2
ucol_equals(const UCollator *source, const UCollator *target) {
    if(source == target) {
        return true;
    }
    return (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target));
}
#endif /* #if !UCONFIG_NO_COLLATION */