From 2431bdeaa6699379cae41f51e997bfa5345a1813 Mon Sep 17 00:00:00 2001 From: sandwoodK Date: Tue, 30 Apr 2024 11:35:45 +0200 Subject: [PATCH] Ability to customize regular expression engine --- CMakeLists.txt | 1 + README.md | 28 ++++ include/valijson/validation_visitor.hpp | 38 +++--- include/valijson/validator.hpp | 35 ++++- ..._with_custom_regular_expression_engine.cpp | 128 ++++++++++++++++++ 5 files changed, 205 insertions(+), 25 deletions(-) create mode 100644 tests/test_validator_with_custom_regular_expression_engine.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 67c35a8..fb0ffa3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,6 +124,7 @@ if(valijson_BUILD_TESTS) tests/test_poly_constraint.cpp tests/test_validation_errors.cpp tests/test_validator.cpp + tests/test_validator_with_custom_regular_expression_engine.cpp tests/test_yaml_cpp_adapter.cpp ) diff --git a/README.md b/README.md index b77d341..88e845f 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,34 @@ Validator validator(Validator::kWeakTypes); This will create a validator that will attempt to cast values to satisfy a schema. The original motivation for this was to support the Boost Property Tree library, which can parse JSON, but stores values as strings. +## Regular Expression Engine + +When enforcing a 'pattern' property, a regular expression engine is used. By default, the DefaultRegexEngine uses std::regex. +std::regex has no protection against catastrophic backtracking, and the gcc implementation is so suboptimal that it can easily lead to a segmentation fault. +One can customise the regular expression engine by implementing one's own wrapper for it and using a ValidatorT with the custom type. 
+ +The regular expression engine wrapper must implement the following interface +```cpp +struct MyRegexpEngine +{ + MyRegexpEngine(const std::string& pattern) + { + //implementation specific + } + + static bool search(const std::string& s, const MyRegexpEngine& r) + { + //implementation specific + } +}; + +``` + +Then to use it +```cpp + using MyValidator = ValidatorT; +``` + ## Memory Management Valijson has been designed to safely manage, and eventually free, the memory that is allocated while parsing a schema or validating a document. When working with an externally loaded schema (i.e. one that is populated using the `SchemaParser` class) you can rely on RAII semantics. diff --git a/include/valijson/validation_visitor.hpp b/include/valijson/validation_visitor.hpp index 45ff105..98a4070 100644 --- a/include/valijson/validation_visitor.hpp +++ b/include/valijson/validation_visitor.hpp @@ -28,7 +28,7 @@ class ValidationResults; * * @tparam AdapterType Adapter type for the target document. */ -template +template class ValidationVisitor: public constraints::ConstraintVisitor { public: @@ -44,14 +44,14 @@ public: * recording error descriptions. If this pointer is set * to nullptr, validation errors will caused validation to * stop immediately. - * @param regexesCache Cache of already created std::regex objects for pattern + * @param regexesCache Cache of already created RegexEngine objects for pattern * constraints. */ ValidationVisitor(const AdapterType &target, std::vector context, const bool strictTypes, ValidationResults *results, - std::unordered_map& regexesCache) + std::unordered_map& regexesCache) : m_target(target), m_context(std::move(context)), m_results(results), @@ -155,7 +155,7 @@ public: ValidationResults newResults; ValidationResults *childResults = (m_results) ? 
&newResults : nullptr; - ValidationVisitor v(m_target, m_context, m_strictTypes, childResults, m_regexesCache); + ValidationVisitor v(m_target, m_context, m_strictTypes, childResults, m_regexesCache); constraint.applyToSubschemas( ValidateSubschemas(m_target, m_context, false, true, v, childResults, &numValidated, nullptr)); @@ -498,7 +498,7 @@ public: std::vector newContext = m_context; newContext.push_back("[" + std::to_string(index) + "]"); - ValidationVisitor validator(*itr, newContext, m_strictTypes, m_results, m_regexesCache); + ValidationVisitor validator(*itr, newContext, m_strictTypes, m_results, m_regexesCache); if (!validator.validateSchema(*additionalItemsSubschema)) { if (m_results) { @@ -874,7 +874,7 @@ public: return false; } - ValidationVisitor v(m_target, m_context, m_strictTypes, nullptr, m_regexesCache); + ValidationVisitor v(m_target, m_context, m_strictTypes, nullptr, m_regexesCache); if (v.validateSchema(*subschema)) { if (m_results) { m_results->pushError(m_context, @@ -901,7 +901,7 @@ public: ValidationResults newResults; ValidationResults *childResults = (m_results) ? 
&newResults : nullptr; - ValidationVisitor v(m_target, m_context, m_strictTypes, childResults, m_regexesCache); + ValidationVisitor v(m_target, m_context, m_strictTypes, childResults, m_regexesCache); constraint.applyToSubschemas( ValidateSubschemas(m_target, m_context, true, true, v, childResults, &numValidated, nullptr)); @@ -943,10 +943,10 @@ public: std::string pattern(constraint.getPattern()); auto it = m_regexesCache.find(pattern); if (it == m_regexesCache.end()) { - it = m_regexesCache.emplace(pattern, std::regex(pattern)).first; + it = m_regexesCache.emplace(pattern, RegexEngine(pattern)).first; } - if (!std::regex_search(m_target.asString(), it->second)) { + if (!RegexEngine::search(m_target.asString(), it->second)) { if (m_results) { m_results->pushError(m_context, "Failed to match regex specified by 'pattern' constraint."); } @@ -1086,7 +1086,7 @@ public: for (const typename AdapterType::ObjectMember m : m_target.asObject()) { adapters::StdStringAdapter stringAdapter(m.first); - ValidationVisitor validator(stringAdapter, m_context, m_strictTypes, nullptr, m_regexesCache); + ValidationVisitor validator(stringAdapter, m_context, m_strictTypes, nullptr, m_regexesCache); if (!validator.validateSchema(*constraint.getSubschema())) { return false; } @@ -1155,7 +1155,7 @@ public: newContext.push_back("[" + std::to_string(index) + "]"); // Create a validator for the current array item - ValidationVisitor validationVisitor(item, newContext, m_strictTypes, m_results, m_regexesCache); + ValidationVisitor validationVisitor(item, newContext, m_strictTypes, m_results, m_regexesCache); // Perform validation if (!validationVisitor.validateSchema(*itemsSubschema)) { @@ -1420,7 +1420,7 @@ private: ValidationResults *results, unsigned int *numValidated, bool *validated, - std::unordered_map& regexesCache) + std::unordered_map& regexesCache) : m_arr(arr), m_context(context), m_continueOnSuccess(continueOnSuccess), @@ -1477,7 +1477,7 @@ private: ValidationResults * const 
m_results; unsigned int * const m_numValidated; bool * const m_validated; - std::unordered_map& m_regexesCache; + std::unordered_map& m_regexesCache; }; /** @@ -1563,7 +1563,7 @@ private: ValidationResults *results, std::set *propertiesMatched, bool *validated, - std::unordered_map& regexesCache) + std::unordered_map& regexesCache) : m_object(object), m_context(context), m_continueOnSuccess(continueOnSuccess), @@ -1640,7 +1640,7 @@ private: ValidationResults * const m_results; std::set * const m_propertiesMatched; bool * const m_validated; - std::unordered_map& m_regexesCache; + std::unordered_map& m_regexesCache; }; /** @@ -1659,7 +1659,7 @@ private: ValidationResults *results, std::set *propertiesMatched, bool *validated, - std::unordered_map& regexesCache) + std::unordered_map& regexesCache) : m_object(object), m_context(context), m_continueOnSuccess(continueOnSuccess), @@ -1716,7 +1716,7 @@ private: ValidationResults * const m_results; std::set * const m_propertiesMatched; bool * const m_validated; - std::unordered_map& m_regexesCache; + std::unordered_map& m_regexesCache; }; /** @@ -1842,7 +1842,7 @@ private: * * @return true if the visitor returns successfully, false otherwise. */ - static bool validationCallback(const constraints::Constraint &constraint, ValidationVisitor &visitor) + static bool validationCallback(const constraints::Constraint &constraint, ValidationVisitor &visitor) { return constraint.accept(visitor); } @@ -1901,7 +1901,7 @@ private: bool m_strictTypes; /// Cached regex objects for pattern constraint - std::unordered_map& m_regexesCache; + std::unordered_map& m_regexesCache; }; } // namespace valijson diff --git a/include/valijson/validator.hpp b/include/valijson/validator.hpp index b60c593..305f44b 100644 --- a/include/valijson/validator.hpp +++ b/include/valijson/validator.hpp @@ -8,10 +8,15 @@ namespace valijson { class Schema; class ValidationResults; + /** - * @brief Class that provides validation functionality. 
+ * @brief Class that provides validation functionality. + * + * @tparam RegexEngine regular expression engine used for pattern constraint validation. + */ -class Validator +template +class ValidatorT { public: enum TypeCheckingMode @@ -23,7 +28,7 @@ public: /** * @brief Construct a Validator that uses strong type checking by default */ - Validator() + ValidatorT() : strictTypes(true) { } /** @@ -31,7 +36,7 @@ public: * * @param typeCheckingMode choice of strong or weak type checking */ - Validator(TypeCheckingMode typeCheckingMode) + ValidatorT(TypeCheckingMode typeCheckingMode) : strictTypes(typeCheckingMode == kStrongTypes) { } /** @@ -58,7 +63,7 @@ public: ValidationResults *results) { // Construct a ValidationVisitor to perform validation at the root level - ValidationVisitor v(target, + ValidationVisitor v(target, std::vector(1, ""), strictTypes, results, regexesCache); return v.validateSchema(schema); @@ -70,7 +75,25 @@ private: bool strictTypes; /// Cached regex objects for pattern constraint. Key - pattern. 
- std::unordered_map regexesCache; + std::unordered_map regexesCache; }; +/** + * @brief Struct that provides a default Regular Expression Engine using std::regex + * + */ +struct DefaultRegexEngine +{ + DefaultRegexEngine(const std::string& pattern) + : regex(pattern) { } + + static bool search(const std::string& s, const DefaultRegexEngine& r) + { + return std::regex_search(s, r.regex); + } + std::regex regex; +}; + +using Validator = ValidatorT; + } // namespace valijson diff --git a/tests/test_validator_with_custom_regular_expression_engine.cpp b/tests/test_validator_with_custom_regular_expression_engine.cpp new file mode 100644 index 0000000..1fd5e0a --- /dev/null +++ b/tests/test_validator_with_custom_regular_expression_engine.cpp @@ -0,0 +1,128 @@ +#ifdef _MSC_VER +#pragma warning(disable: 4706) +#include +#pragma warning(default: 4706) +#else +#include +#endif + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef VALIJSON_BUILD_POCO_ADAPTER +#include +#include +#endif + +using valijson::adapters::AdapterTraits; +using valijson::adapters::RapidJsonAdapter; +using valijson::Schema; +using valijson::SchemaParser; +using valijson::Validator; + +namespace +{ +void createFileFromContent(const std::string& filename, const std::string& content) +{ + std::ofstream outfile(filename, std::ofstream::out | std::ofstream::trunc); + outfile << content << std::endl; + outfile.close(); +}; + +} + +//Potentially : +// Define a struct CustomRegexEngine that handle both problem and use it as replacement of Validator.. 
+//using CustomValidator = ValidatorT; + +TEST(valijson, valijson_be_robust_against_bad_regular_expression) +{ + GTEST_SKIP() << "Skipping begin it cause segmentation fault with default Validator"; + const std::string schema = R"( + { + "properties": { + "text": { + "pattern": "^[\\s\\S]+$", + "type": "string" + } + } + } + )"; + + createFileFromContent("schema.json", schema); + rapidjson::Document mySchemaDoc; + ASSERT_TRUE(valijson::utils::loadDocument("schema.json", mySchemaDoc)); + + Schema mySchema; + SchemaParser parser; + RapidJsonAdapter mySchemaAdapter(mySchemaDoc); + parser.populateSchema(mySchemaAdapter, mySchema); + rapidjson::Document myTargetDoc; + std::string payload = "{ \"text\" : \""; + for (int i = 0; i< 100000; ++i) + payload += 'A'; + payload += "\"}"; + + createFileFromContent("payload.json", payload); + + ASSERT_TRUE(valijson::utils::loadDocument("payload.json", myTargetDoc)); + + //This test crash (segfault) is validator is not customized with custom RegexpEngine + Validator validator; + RapidJsonAdapter myTargetAdapter(myTargetDoc); + ASSERT_TRUE(validator.validate(mySchema, myTargetAdapter, nullptr)); +} + +TEST(valijson, valijson_be_robust_against_catastrophic_backtracking_regular_expression) +{ + GTEST_SKIP() << "Skipping begin it hangs due to non management of catastrophic backtracking with default Validator"; + + const std::string schema = R"( + { + "properties": { + "text": { + "pattern": "((A+)*)+$", + "type": "string" + } + } + } + )"; + + createFileFromContent("schema.json", schema); + rapidjson::Document mySchemaDoc; + ASSERT_TRUE(valijson::utils::loadDocument("schema.json", mySchemaDoc)); + + Schema mySchema; + SchemaParser parser; + RapidJsonAdapter mySchemaAdapter(mySchemaDoc); + parser.populateSchema(mySchemaAdapter, mySchema); + rapidjson::Document myTargetDoc; + std::string payload = "{ \"text\" : \"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC\"}"; + + createFileFromContent("payload.json", payload); + + 
ASSERT_TRUE(valijson::utils::loadDocument("payload.json", myTargetDoc)); + + //This test takes endless time if the validator is not customized with a custom RegexpEngine + Validator validator; + RapidJsonAdapter myTargetAdapter(myTargetDoc); + + //The payload is correct regarding the regexp, but evaluation is impossible due to catastrophic regexp backtracking, so we return false. + ASSERT_FALSE(validator.validate(mySchema, myTargetAdapter, nullptr)); +}