// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*   Copyright (C) 2011-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ppucd.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2011dec11
*   created by: Markus W. Scherer
*/

#ifndef __PPUCD_H__
|
|
#define __PPUCD_H__
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/unistr.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
/** Additions to the uchar.h enum UProperty. */
|
|
enum {
|
|
/** Name_Alias */
|
|
PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
|
|
PPUCD_CONDITIONAL_CASE_MAPPINGS,
|
|
PPUCD_TURKIC_CASE_FOLDING
|
|
};
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
class U_TOOLUTIL_API PropertyNames {
|
|
public:
|
|
virtual ~PropertyNames();
|
|
virtual int32_t getPropertyEnum(const char *name) const;
|
|
virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
|
|
};
|
|
|
|
struct U_TOOLUTIL_API UniProps {
|
|
UniProps();
|
|
~UniProps();
|
|
|
|
int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
|
|
|
|
UChar32 start, end;
|
|
UBool binProps[UCHAR_BINARY_LIMIT];
|
|
int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
|
|
UVersionInfo age;
|
|
UChar32 bmg, bpb;
|
|
UChar32 scf, slc, stc, suc;
|
|
int32_t digitValue;
|
|
const char *numericValue;
|
|
const char *name;
|
|
const char *nameAlias;
|
|
UnicodeString cf, lc, tc, uc;
|
|
UnicodeSet scx;
|
|
};
|
|
|
|
class U_TOOLUTIL_API PreparsedUCD {
|
|
public:
|
|
enum LineType {
|
|
/** No line, end of file. */
|
|
NO_LINE,
|
|
/** Empty line. (Might contain a comment.) */
|
|
EMPTY_LINE,
|
|
|
|
/** ucd;6.1.0 */
|
|
UNICODE_VERSION_LINE,
|
|
|
|
/** property;Binary;Alpha;Alphabetic */
|
|
PROPERTY_LINE,
|
|
/** binary;N;No;F;False */
|
|
BINARY_LINE,
|
|
/** value;gc;Zs;Space_Separator */
|
|
VALUE_LINE,
|
|
|
|
/** defaults;0000..10FFFF;age=NA;bc=L;... */
|
|
DEFAULTS_LINE,
|
|
/** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
|
|
BLOCK_LINE,
|
|
/** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
|
|
CP_LINE,
|
|
/** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */
|
|
UNASSIGNED_LINE,
|
|
|
|
/** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
|
|
ALG_NAMES_RANGE_LINE,
|
|
|
|
LINE_TYPE_COUNT
|
|
};
|
|
|
|
/**
|
|
* Constructor.
|
|
* Prepare this object for a new, empty package.
|
|
*/
|
|
PreparsedUCD(const char *filename, UErrorCode &errorCode);
|
|
|
|
/** Destructor. */
|
|
~PreparsedUCD();
|
|
|
|
/** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
|
|
void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
|
|
|
|
/**
|
|
* Reads a line from the preparsed UCD file.
|
|
* Splits the line by replacing each ';' with a NUL.
|
|
*/
|
|
LineType readLine(UErrorCode &errorCode);
|
|
|
|
/** Returns the number of the line read by readLine(). */
|
|
int32_t getLineNumber() const { return lineNumber; }
|
|
|
|
/** Returns the line's next field, or NULL. */
|
|
const char *nextField();
|
|
|
|
/** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
|
|
const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
|
|
|
|
/** Returns TRUE if the current line has property values. */
|
|
UBool lineHasPropertyValues() const {
|
|
return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE;
|
|
}
|
|
|
|
/**
|
|
* Parses properties from the current line.
|
|
* Clears newValues and sets UProperty codes for property values mentioned
|
|
* on the current line (as opposed to being inherited).
|
|
* Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
|
|
* The returned UniProps are usable until the next line of the same type is read.
|
|
*/
|
|
const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
|
|
|
|
/**
|
|
* Returns the code point range for the current algnamesrange line.
|
|
* Calls & parses nextField().
|
|
* Further nextField() calls will yield the range's type & prefix string.
|
|
* Returns U_SUCCESS(errorCode).
|
|
*/
|
|
UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
|
|
|
|
private:
|
|
UBool isLineBufferAvailable(int32_t i) {
|
|
return defaultLineIndex!=i && blockLineIndex!=i;
|
|
}
|
|
|
|
/** Resets the field iterator and returns the line's first field (the line type field). */
|
|
const char *firstField();
|
|
|
|
UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
|
|
UErrorCode &errorCode);
|
|
UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
|
|
UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
|
|
void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
|
|
void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
|
|
|
|
static const int32_t kNumLineBuffers=3;
|
|
|
|
PropertyNames *icuPnames; // owned
|
|
const PropertyNames *pnames; // aliased
|
|
FILE *file;
|
|
int32_t defaultLineIndex, blockLineIndex, lineIndex;
|
|
int32_t lineNumber;
|
|
LineType lineType;
|
|
char *fieldLimit;
|
|
char *lineLimit;
|
|
|
|
UVersionInfo ucdVersion;
|
|
UniProps defaultProps, blockProps, cpProps;
|
|
UnicodeSet blockValues;
|
|
// Multiple lines so that default and block properties can maintain pointers
|
|
// into their line buffers.
|
|
char lines[kNumLineBuffers][4096];
|
|
};
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif // __PPUCD_H__
|