| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
- #pragma once
- // NOLINTBEGIN(readability-implicit-bool-conversion)
- #include <exception> // IWYU pragma: keep
- #include <memory> // IWYU pragma: keep
- #include <regex>
- #include <string>
- #include <string_view>
- #include <unordered_map>
- #include <jvalidate/_macro.h>
- #if JVALIDATE_HAS_ICU
- #include <unicode/parseerr.h>
- #include <unicode/regex.h>
- #include <unicode/stringpiece.h>
- #include <unicode/unistr.h>
- #include <unicode/utypes.h>
- #endif
- namespace jvalidate {
/**
 * @brief An implementation of a regular expression "engine", for use with
 * constraints like "pattern" and "patternProperties".
 * Uses std::regex as its underlying implementation.
 *
 * While being std::regex means that it is the most sensible choice for a
 * default RegexEngine, the performance of std::regex is generally the worst
 * among C++ regex utilities, and it struggles to compile several patterns.
 * See https://stackoverflow.com/questions/70583395/ for an explanation.
 *
 * If you need to use complicated patterns in your json schema, provide a
 * RegexEngine compatible wrapper for a different library, such as re2.
 * std::regex does not support graphemes, meaning that multi-byte characters
 * will need to be wrapped in groups if you want to repeat them.
 *
 * Regular expressions are compiled using the default ECMAScript flags, which
 * is almost, but not quite, compliant with the ECMA-262 standard.
 */
class StdRegexEngine {
private:
  // Compiled patterns, keyed by their source text. A pattern that failed to
  // compile is stored as a default-constructed std::regex, which matches no
  // character sequence, so the failed compilation is not repeated.
  std::unordered_map<std::string, std::regex> cache_;

  /**
   * @brief Compile a pattern, mapping a syntax error onto the "matches
   * nothing" default-constructed regex.
   *
   * Only std::regex_error is translated; any other exception propagates to
   * the caller so that it is not recorded as "this pattern is invalid".
   *
   * @param regex The pattern source text
   * @return The compiled pattern, or an empty std::regex if it is malformed
   */
  static std::regex compile_or_reject(std::string const & regex) {
    try {
      return std::regex(regex);
    } catch (std::regex_error const &) {
      return std::regex();
    }
  }

public:
  /// @return A human-readable identifier for this engine.
  static std::string_view engine_name() { return "std::regex[ECMAScript]"; }

  /**
   * @brief Test if a string is a pattern that std::regex can compile.
   *
   * @param regex The candidate pattern
   * @return true if compilation succeeded, false if it threw a std::exception
   */
  static bool is_regex(std::string_view regex) try {
    std::string const pattern(regex);
    [[maybe_unused]] std::regex const probe(pattern);
    return true;
  } catch (std::exception const &) { return false; }

  /**
   * @brief Test if a pattern matches anywhere within a piece of text.
   *
   * The compiled pattern is cached on first use. Invalid patterns are cached
   * too, so they are compiled (and rejected) only once.
   *
   * @param regex The pattern source text
   * @param text The text to search
   * @return true if the pattern is valid and std::regex_search finds a match;
   * false otherwise, including when a std::exception is thrown while
   * compiling or matching.
   */
  bool search(std::string const & regex, std::string const & text) try {
    auto it = cache_.find(regex);
    if (it == cache_.end()) {
      it = cache_.emplace(regex, compile_or_reject(regex)).first;
    }
    return std::regex_search(text, it->second);
  } catch (std::exception const &) { return false; }
};
- }
- #if JVALIDATE_HAS_ICU
- namespace jvalidate {
- /**
- * @brief An implementation of a regular expression "engine", for use with
- * constraints like "pattern" and "patternProperties".
- * Uses the "International Components for Unicode" (icu4c) library for its
- * underlying implementation.
- *
- * These regular expressions operate on the level of graphemes, rather than
- * characters. This means that multi-byte characters like emojis will be
- * treated as singular characters for the purpose of "character sets" and
- * repetition operators.
- *
- * This regex engine is not ECMA-262 compliant, which means that certain cases
- * will not be recognized. This is a notice rather than a true issue, since
- * many other languages' regex libraries (e.g. Python) are also not ECMA-262
- * compliant.
- *
- * This means that we pass test cases that ECMAScript rejects, such as:
- * - i18n digit characters are captured by \\d
- * - i18n characters can be matched by \\w (if they are i18nword chars)
- */
- class ICURegexEngine {
- private:
- std::unordered_map<std::string, std::unique_ptr<icu::RegexPattern>> cache_;
- public:
- static std::string_view engine_name() { return "icu::RegexPattern"; }
- static bool is_regex(std::string_view regex) {
- icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));
- UErrorCode status = U_ZERO_ERROR;
- UParseError perr;
- std::unique_ptr<icu::RegexPattern> const tmp(icu::RegexPattern::compile(ucs, perr, status));
- return not U_FAILURE(status);
- }
- bool search(std::string const & regex, std::string const & text) {
- auto [it, created] = cache_.try_emplace(regex, nullptr);
- if (created) {
- icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));
- UErrorCode status = U_ZERO_ERROR;
- UParseError perr;
- it->second.reset(icu::RegexPattern::compile(ucs, perr, status));
- if (U_FAILURE(status)) {
- // TODO(samjaffe): Provide info?
- return false;
- }
- }
- if (it->second == nullptr) {
- return false; // Regex was invalid - and we cached that
- }
- UErrorCode status = U_ZERO_ERROR;
- icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(text));
- std::unique_ptr<icu::RegexMatcher> matcher(it->second->matcher(ucs, status));
- JVALIDATE_RETURN_IF(U_FAILURE(status), false); // Doesn't appear possilbe
- return matcher->find(status);
- }
- };
- }
- #endif
- // NOLINTEND(readability-implicit-bool-conversion)
|