regex.h 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. #pragma once
  2. // NOLINTBEGIN(readability-implicit-bool-conversion)
  3. #include <exception> // IWYU pragma: keep
  4. #include <memory> // IWYU pragma: keep
  5. #include <regex>
  6. #include <string>
  7. #include <string_view>
  8. #include <unordered_map>
  9. #include <jvalidate/_macro.h>
  10. #if JVALIDATE_HAS_ICU
  11. #include <unicode/parseerr.h>
  12. #include <unicode/regex.h>
  13. #include <unicode/stringpiece.h>
  14. #include <unicode/unistr.h>
  15. #include <unicode/utypes.h>
  16. #endif
  17. namespace jvalidate {
  /**
   * @brief An implementation of a regular expression "engine", for use with
   * constraints like "pattern" and "patternProperties".
   * Uses std::regex as its underlying implementation.
   *
   * While being std::regex means that it is the most sensible choice for a
   * default RegexEngine, the performance of std::regex is generally the worst
   * among C++ regex utilities, and it struggles to compile several patterns.
   * See https://stackoverflow.com/questions/70583395/ for an explanation.
   *
   * If you need to use complicated patterns in your json schema, provide a
   * RegexEngine compatible wrapper for a different library, such as re2.
   * std::regex does not support graphemes, meaning that multi-byte characters
   * will need to be wrapped in groups if you want to repeat them.
   *
   * Regular expressions are compiled using the default ECMAScript flags, which
   * is almost, but not quite, compliant with the ECMA-262 standard.
   *
   * Patterns handed to search() are cached per engine instance. A pattern that
   * std::regex rejects with std::regex_error is cached as "invalid", so that
   * repeated searches with the same bad pattern do not recompile it (and pay
   * for a throw/catch) on every call.
   */
  class StdRegexEngine {
  private:
    /// A cached compilation result: the expression, and whether it compiled.
    struct Entry {
      std::regex expr;
      bool valid = false;
    };

    std::unordered_map<std::string, Entry> cache_;

    /**
     * @brief Compile a pattern into a cache entry.
     *
     * @param pattern The regular expression source text.
     *
     * @return An entry with valid == true on success, or a default entry
     * (valid == false) if std::regex rejected the pattern. Exceptions other
     * than std::regex_error propagate to the caller, so that they are not
     * recorded as a permanently invalid pattern.
     */
    static Entry compile(std::string const & pattern) {
      try {
        return Entry{std::regex(pattern), true};
      } catch (std::regex_error const &) {
        return Entry{};
      }
    }

  public:
    /// @brief A human-readable identifier for this engine.
    static std::string_view engine_name() { return "std::regex[ECMAScript]"; }

    /**
     * @brief Test if a string is a regular expression that std::regex accepts.
     *
     * @param regex The candidate pattern.
     *
     * @return true if the pattern compiles, false if compiling it throws.
     */
    static bool is_regex(std::string_view regex) try {
      return (std::regex(std::string(regex)), true);
    } catch (std::exception const &) { return false; }

    /**
     * @brief Test if the pattern matches anywhere in the text.
     *
     * @param regex The pattern to search with. Compiled on first use and cached.
     * @param text The text to search in.
     *
     * @return true if std::regex_search finds a match; false if there is no
     * match, if the pattern is invalid, or if any std::exception is thrown
     * while compiling or matching.
     */
    bool search(std::string const & regex, std::string const & text) try {
      auto it = cache_.find(regex);
      if (it == cache_.end()) {
        // Compile before inserting: if compile() throws something other than
        // regex_error, nothing is added to the cache.
        it = cache_.emplace(regex, compile(regex)).first;
      }
      Entry const & entry = it->second;
      return entry.valid && std::regex_search(text, entry.expr);
    } catch (std::exception const &) { return false; }
  };
  49. }
  50. #if JVALIDATE_HAS_ICU
  51. namespace jvalidate {
  52. /**
  53. * @brief An implementation of a regular expression "engine", for use with
  54. * constraints like "pattern" and "patternProperties".
  55. * Uses the "International Components for Unicode" (icu4c) library for its
  56. * underlying implementation.
  57. *
  58. * These regular expressions operate on the level of graphemes, rather than
  59. * characters. This means that multi-byte characters like emojis will be
  60. * treated as singular characters for the purpose of "character sets" and
  61. * repetition operators.
  62. *
  63. * This regex engine is not ECMA-262 compliant, which means that certain cases
  64. * will not be recognized. This is a notice rather than a true issue, since
  65. * many other languages' regex libraries (e.g. Python) are also not ECMA-262
  66. * compliant.
  67. *
  68. * This means that we pass test cases that ECMAScript rejects, such as:
  69. * - i18n digit characters are captured by \\d
  70. * - i18n characters can be matched by \\w (if they are i18nword chars)
  71. */
  72. class ICURegexEngine {
  73. private:
  74. std::unordered_map<std::string, std::unique_ptr<icu::RegexPattern>> cache_;
  75. public:
  76. static std::string_view engine_name() { return "icu::RegexPattern"; }
  77. static bool is_regex(std::string_view regex) {
  78. icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));
  79. UErrorCode status = U_ZERO_ERROR;
  80. UParseError perr;
  81. std::unique_ptr<icu::RegexPattern> const tmp(icu::RegexPattern::compile(ucs, perr, status));
  82. return not U_FAILURE(status);
  83. }
  84. bool search(std::string const & regex, std::string const & text) {
  85. auto [it, created] = cache_.try_emplace(regex, nullptr);
  86. if (created) {
  87. icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));
  88. UErrorCode status = U_ZERO_ERROR;
  89. UParseError perr;
  90. it->second.reset(icu::RegexPattern::compile(ucs, perr, status));
  91. if (U_FAILURE(status)) {
  92. // TODO(samjaffe): Provide info?
  93. return false;
  94. }
  95. }
  96. if (it->second == nullptr) {
  97. return false; // Regex was invalid - and we cached that
  98. }
  99. UErrorCode status = U_ZERO_ERROR;
  100. icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(text));
  101. std::unique_ptr<icu::RegexMatcher> matcher(it->second->matcher(ucs, status));
  102. JVALIDATE_RETURN_IF(U_FAILURE(status), false); // Doesn't appear possilbe
  103. return matcher->find(status);
  104. }
  105. };
  106. }
  107. #endif
  108. // NOLINTEND(readability-implicit-bool-conversion)