#pragma once

// NOTE(review): the include targets and template arguments were missing from
// the reviewed text (everything between '<' and '>' had been stripped). They
// are reconstructed here from the names this header uses - confirm against
// the upstream file.
#include <exception>
#include <memory>
#include <optional>
#include <regex>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>

#if JVALIDATE_HAS_ICU
#include <unicode/parseerr.h>
#include <unicode/regex.h>
#include <unicode/stringpiece.h>
#include <unicode/unistr.h>
#include <unicode/utypes.h>
#endif

namespace jvalidate {
/**
 * @brief An implementation of a regular expression "engine", for use with
 * constraints like "pattern" and "patternProperties".
 * Uses std::regex as its underlying implementation.
 *
 * While being std::regex means that it is the most sensible choice for a
 * default RegexEngine, the performance of std::regex is generally the worst
 * among C++ regex utilities, and it struggles to compile several patterns.
 * See https://stackoverflow.com/questions/70583395/ for an explanation.
 *
 * If you need to use complicated patterns in your json schema, provide a
 * RegexEngine compatible wrapper for a different library, such as re2.
 * std::regex does not support graphemes, meaning that multi-byte characters
 * will need to be wrapped in groups if you want to repeat them.
 *
 * Regular expressions are compiled using the default ECMAScript flags, which
 * is almost, but not quite, compliant with the ECMA-262 standard.
 */
class StdRegexEngine {
private:
  // Compiled patterns keyed by their source text. An empty optional records a
  // pattern that failed to compile, so that it is not recompiled (and the
  // exception re-thrown) on every subsequent search.
  std::unordered_map<std::string, std::optional<std::regex>> cache_;

public:
  /// @return A human-readable name identifying this engine.
  static std::string_view engine_name() { return "std::regex[ECMAScript]"; }

  /**
   * @brief Test whether a string compiles as a regular expression.
   * @param regex The candidate pattern
   * @return true if std::regex accepts the pattern, false if constructing it
   * throws a std::exception.
   */
  static bool is_regex(std::string_view regex) try {
    return (std::regex(std::string(regex)), true);
  } catch (std::exception const &) {
    return false;
  }

  /**
   * @brief Test whether a pattern matches anywhere within a piece of text.
   * The compiled pattern (or the fact that it is malformed) is cached.
   * @param regex The pattern to look for
   * @param text The text to search in
   * @return true if the pattern compiles and std::regex_search finds a match;
   * false otherwise, including when a std::exception is thrown.
   */
  bool search(std::string const & regex, std::string const & text) try {
    auto it = cache_.find(regex);
    if (it == cache_.end()) {
      std::optional<std::regex> compiled;
      try {
        compiled.emplace(regex);
      } catch (std::regex_error const &) {
        // Malformed pattern: fall through and cache the empty optional.
        // Any other exception (e.g. std::bad_alloc) escapes to the outer
        // handler without caching anything, so a later call can retry.
      }
      it = cache_.emplace(regex, std::move(compiled)).first;
    }

    std::optional<std::regex> const & re = it->second;
    return re.has_value() && std::regex_search(text, *re);
  } catch (std::exception const &) {
    return false;
  }
};
}

#if JVALIDATE_HAS_ICU
namespace jvalidate {
/**
 * @brief An implementation of a regular expression "engine", for use with
 * constraints like "pattern" and "patternProperties".
 * Uses the "International Components for Unicode" (icu4c) library for its
 * underlying implementation.
 *
 * These regular expressions operate on the level of graphemes, rather than
 * characters. This means that multi-byte characters like emojis will be
 * treated as singular characters for the purpose of "character sets" and
 * repetition operators.
 *
 * This regex engine is not ECMA-262 compliant, which means that certain cases
 * will not be recognized. This is a notice rather than a true issue, since
 * many other languages' regex libraries (e.g. Python) are also not ECMA-262
 * compliant.
 *
 * This means that we pass test cases that ECMAScript rejects, such as:
 * - i18n digit characters are captured by \\d
 * - i18n characters can be matched by \\w (if they are i18nword chars)
 */
class ICURegexEngine {
private:
  // Compiled patterns keyed by their UTF-8 source text. A null pointer
  // records a pattern that failed to compile.
  std::unordered_map<std::string, std::unique_ptr<icu::RegexPattern>> cache_;

public:
  /// @return A human-readable name identifying this engine.
  static std::string_view engine_name() { return "icu::RegexPattern"; }

  /**
   * @brief Test whether a string compiles as a regular expression.
   * @param regex The candidate pattern, converted from UTF-8
   * @return true if icu::RegexPattern::compile does not report a failure
   * status.
   */
  static bool is_regex(std::string_view regex) {
    icu::UnicodeString const pattern = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));

    UErrorCode status = U_ZERO_ERROR;
    UParseError parse_error;
    // Owned only so that the compiled pattern is released again.
    std::unique_ptr<icu::RegexPattern> const compiled(
        icu::RegexPattern::compile(pattern, parse_error, status));
    return not U_FAILURE(status);
  }

  /**
   * @brief Test whether a pattern matches anywhere within a piece of text.
   * The compiled pattern (or the fact that it is invalid) is cached.
   * @param regex The pattern to look for, converted from UTF-8
   * @param text The text to search in, converted from UTF-8
   * @return true if the pattern compiles and a match is found, else false.
   */
  bool search(std::string const & regex, std::string const & text) {
    auto const [it, created] = cache_.try_emplace(regex, nullptr);
    if (created) {
      icu::UnicodeString const pattern = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));

      UErrorCode compile_status = U_ZERO_ERROR;
      UParseError parse_error;
      std::unique_ptr<icu::RegexPattern> compiled(
          icu::RegexPattern::compile(pattern, parse_error, compile_status));
      if (U_FAILURE(compile_status)) {
        // TODO: Provide info?
        // Make sure a failed compilation is always cached as "invalid".
        compiled.reset();
      }
      it->second = std::move(compiled);
    }

    if (it->second == nullptr) {
      return false; // Regex was invalid - and we cached that
    }

    UErrorCode status = U_ZERO_ERROR;
    // Declared before the matcher so that it is still alive during find().
    icu::UnicodeString const subject = icu::UnicodeString::fromUTF8(icu::StringPiece(text));
    std::unique_ptr<icu::RegexMatcher> const matcher(it->second->matcher(subject, status));
    if (U_FAILURE(status)) {
      return false; // Doesn't appear possible
    }

    return matcher->find(status);
  }
};
}
#endif