mirror of
https://github.com/Tencent/rapidjson.git
synced 2025-03-09 19:24:23 +01:00
Optimization for Regex and Schema
This commit is contained in:
parent
a006648398
commit
a33af83ee4
@ -71,13 +71,17 @@ class GenericRegex {
|
|||||||
public:
|
public:
|
||||||
typedef typename Encoding::Ch Ch;
|
typedef typename Encoding::Ch Ch;
|
||||||
|
|
||||||
GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(), anchorBegin_(), anchorEnd_() {
|
GenericRegex(const Ch* source, Allocator* allocator = 0) :
|
||||||
|
states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
|
||||||
|
stateSet_(), state0_(allocator, 0), state1_(allocator, 0), anchorBegin_(), anchorEnd_()
|
||||||
|
{
|
||||||
GenericStringStream<Encoding> ss(source);
|
GenericStringStream<Encoding> ss(source);
|
||||||
DecodedStream<GenericStringStream<Encoding> > ds(ss);
|
DecodedStream<GenericStringStream<Encoding> > ds(ss);
|
||||||
Parse(ds);
|
Parse(ds);
|
||||||
}
|
}
|
||||||
|
|
||||||
~GenericRegex() {
|
~GenericRegex() {
|
||||||
|
Allocator::Free(stateSet_);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool IsValid() const {
|
bool IsValid() const {
|
||||||
@ -308,6 +312,14 @@ private:
|
|||||||
printf("\n");
|
printf("\n");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Preallocate buffer for SearchWithAnchoring()
|
||||||
|
RAPIDJSON_ASSERT(stateSet_ == 0);
|
||||||
|
if (stateCount_ > 0) {
|
||||||
|
stateSet_ = static_cast<unsigned*>(states_.GetAllocator().Malloc(GetStateSetSize()));
|
||||||
|
state0_.Reserve<SizeType>(stateCount_);
|
||||||
|
state1_.Reserve<SizeType>(stateCount_);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
|
SizeType NewState(SizeType out, SizeType out1, unsigned codepoint) {
|
||||||
@ -568,21 +580,15 @@ private:
|
|||||||
RAPIDJSON_ASSERT(IsValid());
|
RAPIDJSON_ASSERT(IsValid());
|
||||||
DecodedStream<InputStream> ds(is);
|
DecodedStream<InputStream> ds(is);
|
||||||
|
|
||||||
Allocator allocator;
|
state0_.Clear();
|
||||||
Stack<Allocator> state0(&allocator, stateCount_ * sizeof(SizeType));
|
Stack<Allocator> *current = &state0_, *next = &state1_;
|
||||||
Stack<Allocator> state1(&allocator, stateCount_ * sizeof(SizeType));
|
const size_t stateSetSize = GetStateSetSize();
|
||||||
Stack<Allocator> *current = &state0, *next = &state1;
|
std::memset(stateSet_, 0, stateSetSize);
|
||||||
|
|
||||||
const size_t stateSetSize = (stateCount_ + 31) / 32 * 4;
|
|
||||||
unsigned* stateSet = static_cast<unsigned*>(allocator.Malloc(stateSetSize));
|
|
||||||
std::memset(stateSet, 0, stateSetSize);
|
|
||||||
|
|
||||||
bool matched = false;
|
|
||||||
matched = AddState(stateSet, *current, root_);
|
|
||||||
|
|
||||||
|
bool matched = AddState(*current, root_);
|
||||||
unsigned codepoint;
|
unsigned codepoint;
|
||||||
while (!current->Empty() && (codepoint = ds.Take()) != 0) {
|
while (!current->Empty() && (codepoint = ds.Take()) != 0) {
|
||||||
std::memset(stateSet, 0, stateSetSize);
|
std::memset(stateSet_, 0, stateSetSize);
|
||||||
next->Clear();
|
next->Clear();
|
||||||
matched = false;
|
matched = false;
|
||||||
for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
|
for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
|
||||||
@ -591,39 +597,38 @@ private:
|
|||||||
sr.codepoint == kAnyCharacterClass ||
|
sr.codepoint == kAnyCharacterClass ||
|
||||||
(sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
|
(sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
|
||||||
{
|
{
|
||||||
matched = AddState(stateSet, *next, sr.out) || matched;
|
matched = AddState(*next, sr.out) || matched;
|
||||||
if (!anchorEnd && matched)
|
if (!anchorEnd && matched)
|
||||||
goto exit;
|
return true;
|
||||||
}
|
}
|
||||||
if (!anchorBegin)
|
if (!anchorBegin)
|
||||||
AddState(stateSet, *next, root_);
|
AddState(*next, root_);
|
||||||
}
|
}
|
||||||
Stack<Allocator>* temp = current;
|
internal::Swap(current, next);
|
||||||
current = next;
|
|
||||||
next = temp;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
exit:
|
|
||||||
Allocator::Free(stateSet);
|
|
||||||
return matched;
|
return matched;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t GetStateSetSize() const {
|
||||||
|
return (stateCount_ + 31) / 32 * 4;
|
||||||
|
}
|
||||||
|
|
||||||
// Return whether the added state is a match state
|
// Return whether the added state is a match state
|
||||||
bool AddState(unsigned* stateSet, Stack<Allocator>& l, SizeType index) const {
|
bool AddState(Stack<Allocator>& l, SizeType index) const {
|
||||||
if (index == kRegexInvalidState)
|
if (index == kRegexInvalidState)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
const State& s = GetState(index);
|
const State& s = GetState(index);
|
||||||
if (s.out1 != kRegexInvalidState) { // Split
|
if (s.out1 != kRegexInvalidState) { // Split
|
||||||
bool matched = AddState(stateSet, l, s.out);
|
bool matched = AddState(l, s.out);
|
||||||
matched = AddState(stateSet, l, s.out1) || matched;
|
return AddState(l, s.out1) || matched;
|
||||||
return matched;
|
|
||||||
}
|
}
|
||||||
else if (!(stateSet[index >> 5] & (1 << (index & 31)))) {
|
else if (!(stateSet_[index >> 5] & (1 << (index & 31)))) {
|
||||||
stateSet[index >> 5] |= (1 << (index & 31));
|
stateSet_[index >> 5] |= (1 << (index & 31));
|
||||||
*l.template Push<SizeType>() = index;
|
*l.template PushUnsafe<SizeType>() = index;
|
||||||
}
|
}
|
||||||
return GetState(index).out == kRegexInvalidState;
|
return s.out == kRegexInvalidState; // by using PushUnsafe() above, we can ensure s is not invalidated by reallocation.
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
|
bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
|
||||||
@ -642,6 +647,11 @@ private:
|
|||||||
SizeType root_;
|
SizeType root_;
|
||||||
SizeType stateCount_;
|
SizeType stateCount_;
|
||||||
SizeType rangeCount_;
|
SizeType rangeCount_;
|
||||||
|
|
||||||
|
// For SearchWithAnchoring()
|
||||||
|
uint32_t* stateSet_; // allocated by states_.GetAllocator()
|
||||||
|
mutable Stack<Allocator> state0_;
|
||||||
|
mutable Stack<Allocator> state1_;
|
||||||
bool anchorBegin_;
|
bool anchorBegin_;
|
||||||
bool anchorEnd_;
|
bool anchorEnd_;
|
||||||
};
|
};
|
||||||
|
@ -38,7 +38,6 @@ public:
|
|||||||
// Optimization note: Do not allocate memory for stack_ in constructor.
|
// Optimization note: Do not allocate memory for stack_ in constructor.
|
||||||
// Do it lazily when first Push() -> Expand() -> Resize().
|
// Do it lazily when first Push() -> Expand() -> Resize().
|
||||||
Stack(Allocator* allocator, size_t stackCapacity) : allocator_(allocator), ownAllocator_(0), stack_(0), stackTop_(0), stackEnd_(0), initialCapacity_(stackCapacity) {
|
Stack(Allocator* allocator, size_t stackCapacity) : allocator_(allocator), ownAllocator_(0), stack_(0), stackTop_(0), stackEnd_(0), initialCapacity_(stackCapacity) {
|
||||||
RAPIDJSON_ASSERT(stackCapacity > 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
|
#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
|
||||||
|
@ -300,15 +300,17 @@ struct SchemaValidationContext {
|
|||||||
factory.DestroySchemaValidator(patternPropertiesValidators[i]);
|
factory.DestroySchemaValidator(patternPropertiesValidators[i]);
|
||||||
factory.FreeState(patternPropertiesValidators);
|
factory.FreeState(patternPropertiesValidators);
|
||||||
}
|
}
|
||||||
factory.FreeState(patternPropertiesSchemas);
|
if (patternPropertiesSchemas)
|
||||||
factory.FreeState(objectDependencies);
|
factory.FreeState(patternPropertiesSchemas);
|
||||||
|
if (objectDependencies)
|
||||||
|
factory.FreeState(objectDependencies);
|
||||||
}
|
}
|
||||||
|
|
||||||
SchemaValidatorFactoryType& factory;
|
SchemaValidatorFactoryType& factory;
|
||||||
const SchemaType* schema;
|
const SchemaType* schema;
|
||||||
const SchemaType* valueSchema;
|
const SchemaType* valueSchema;
|
||||||
const Ch* invalidKeyword;
|
const Ch* invalidKeyword;
|
||||||
void* hasher; // Only calidator access
|
void* hasher; // Only validator access
|
||||||
void* arrayElementHashCodes; // Only validator access this
|
void* arrayElementHashCodes; // Only validator access this
|
||||||
ISchemaValidator** validators;
|
ISchemaValidator** validators;
|
||||||
SizeType validatorCount;
|
SizeType validatorCount;
|
||||||
@ -613,7 +615,7 @@ public:
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool EndValue(Context& context) const {
|
RAPIDJSON_FORCEINLINE bool EndValue(Context& context) const {
|
||||||
if (context.patternPropertiesValidatorCount > 0) {
|
if (context.patternPropertiesValidatorCount > 0) {
|
||||||
bool otherValid = false;
|
bool otherValid = false;
|
||||||
SizeType count = context.patternPropertiesValidatorCount;
|
SizeType count = context.patternPropertiesValidatorCount;
|
||||||
@ -1080,8 +1082,12 @@ private:
|
|||||||
// O(n)
|
// O(n)
|
||||||
template <typename ValueType>
|
template <typename ValueType>
|
||||||
bool FindPropertyIndex(const ValueType& name, SizeType* outIndex) const {
|
bool FindPropertyIndex(const ValueType& name, SizeType* outIndex) const {
|
||||||
|
SizeType len = name.GetStringLength();
|
||||||
|
const Ch* str = name.GetString();
|
||||||
for (SizeType index = 0; index < propertyCount_; index++)
|
for (SizeType index = 0; index < propertyCount_; index++)
|
||||||
if (properties_[index].name == name) {
|
if (properties_[index].name.GetStringLength() == len &&
|
||||||
|
(std::memcmp(properties_[index].name.GetString(), str, sizeof(Ch) * len) == 0))
|
||||||
|
{
|
||||||
*outIndex = index;
|
*outIndex = index;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -1703,7 +1709,7 @@ private:
|
|||||||
PushSchema(root_);
|
PushSchema(root_);
|
||||||
else {
|
else {
|
||||||
if (CurrentContext().inArray)
|
if (CurrentContext().inArray)
|
||||||
AppendToken(CurrentContext().arrayElementIndex);
|
AppendToken<Ch>(CurrentContext().arrayElementIndex);
|
||||||
|
|
||||||
if (!CurrentSchema().BeginValue(CurrentContext()))
|
if (!CurrentSchema().BeginValue(CurrentContext()))
|
||||||
return false;
|
return false;
|
||||||
@ -1767,21 +1773,23 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void AppendToken(const Ch* str, SizeType len) {
|
void AppendToken(const Ch* str, SizeType len) {
|
||||||
*documentStack_.template Push<Ch>() = '/';
|
documentStack_.template Reserve<Ch>(1 + len * 2); // worst case all characters are escaped as two characters
|
||||||
|
*documentStack_.template PushUnsafe<Ch>() = '/';
|
||||||
for (SizeType i = 0; i < len; i++) {
|
for (SizeType i = 0; i < len; i++) {
|
||||||
if (str[i] == '~') {
|
if (str[i] == '~') {
|
||||||
*documentStack_.template Push<Ch>() = '~';
|
*documentStack_.template PushUnsafe<Ch>() = '~';
|
||||||
*documentStack_.template Push<Ch>() = '0';
|
*documentStack_.template PushUnsafe<Ch>() = '0';
|
||||||
}
|
}
|
||||||
else if (str[i] == '/') {
|
else if (str[i] == '/') {
|
||||||
*documentStack_.template Push<Ch>() = '~';
|
*documentStack_.template PushUnsafe<Ch>() = '~';
|
||||||
*documentStack_.template Push<Ch>() = '1';
|
*documentStack_.template PushUnsafe<Ch>() = '1';
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
*documentStack_.template Push<Ch>() = str[i];
|
*documentStack_.template PushUnsafe<Ch>() = str[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Ch>
|
||||||
void AppendToken(SizeType index) {
|
void AppendToken(SizeType index) {
|
||||||
*documentStack_.template Push<Ch>() = '/';
|
*documentStack_.template Push<Ch>() = '/';
|
||||||
char buffer[21];
|
char buffer[21];
|
||||||
@ -1790,9 +1798,27 @@ private:
|
|||||||
*documentStack_.template Push<Ch>() = buffer[i];
|
*documentStack_.template Push<Ch>() = buffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
void PushSchema(const SchemaType& schema) { new (schemaStack_.template Push<Context>()) Context(*this, &schema); }
|
// Specialized version for char to prevent buffer copying.
|
||||||
|
template <>
|
||||||
|
void AppendToken<char>(SizeType index) {
|
||||||
|
if (sizeof(SizeType) == 4) {
|
||||||
|
char *buffer = documentStack_.template Push<Ch>(1 + 10); // '/' + uint
|
||||||
|
*buffer++ = '/';
|
||||||
|
const char* end = internal::u32toa(index, buffer);
|
||||||
|
documentStack_.template Pop<Ch>(static_cast<size_t>(10 - (end - buffer)));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
char *buffer = documentStack_.template Push<Ch>(1 + 20); // '/' + uint64
|
||||||
|
*buffer++ = '/';
|
||||||
|
const char* end = internal::u64toa(index, buffer);
|
||||||
|
documentStack_.template Pop<Ch>(static_cast<size_t>(20 - (end - buffer)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
RAPIDJSON_FORCEINLINE void PushSchema(const SchemaType& schema) { new (schemaStack_.template Push<Context>()) Context(*this, &schema); }
|
||||||
|
|
||||||
void PopSchema() {
|
RAPIDJSON_FORCEINLINE void PopSchema() {
|
||||||
Context* c = schemaStack_.template Pop<Context>(1);
|
Context* c = schemaStack_.template Pop<Context>(1);
|
||||||
if (HashCodeArray* a = static_cast<HashCodeArray*>(c->arrayElementHashCodes)) {
|
if (HashCodeArray* a = static_cast<HashCodeArray*>(c->arrayElementHashCodes)) {
|
||||||
a->~HashCodeArray();
|
a->~HashCodeArray();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user