regex.h 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #pragma once
  2. #include <regex>
  3. #include <unordered_map>
  4. #include <jvalidate/_macro.h>
  5. #if JVALIDATE_HAS_ICU
  6. #include <unicode/regex.h>
  7. #include <unicode/ustring.h>
  8. #include <unicode/utypes.h>
  9. #endif
  10. namespace jvalidate {
  /**
   * @brief An implementation of a regular expression "engine", for use with
   * constraints like "pattern" and "patternProperties".
   * Uses std::regex as its underlying implementation.
   *
   * While being std::regex means that it is the most sensible choice for a
   * default RegexEngine, the performance of std::regex is generally the worst
   * among C++ regex utilities, and it struggles to compile several patterns.
   * See https://stackoverflow.com/questions/70583395/ for an explanation.
   *
   * If you need to use complicated patterns in your json schema, provide a
   * RegexEngine compatible wrapper for a different library, such as re2.
   * std::regex does not support graphemes, meaning that multi-byte characters
   * will need to be wrapped in groups if you want to repeat them.
   *
   * Regular expressions are compiled using the default ECMAScript flags, which
   * is almost, but not quite, compliant with the ECMA-262 standard.
   */
  class StdRegexEngine {
  private:
    // Compiled expressions keyed by their pattern text, so that each distinct
    // pattern is only compiled the first time search() sees it.
    std::unordered_map<std::string, std::regex> cache_;

  public:
    /** @brief Human-readable identifier of this engine. */
    static std::string_view engine_name() { return "std::regex[ECMAScript]"; }

    /**
     * @brief Test whether a string compiles as a std::regex.
     *
     * @param regex The candidate pattern text.
     *
     * @return true if constructing a std::regex from the pattern succeeds,
     * false if construction throws any std::exception.
     */
    static bool is_regex(std::string_view regex) {
      try {
        // Compile straight from the view's character range; there is no need
        // to copy the pattern into a temporary std::string first.
        [[maybe_unused]] std::regex const probe(regex.begin(), regex.end());
        return true;
      } catch (std::exception const &) {
        return false;
      }
    }

    /**
     * @brief Test whether a pattern matches anywhere inside a piece of text.
     *
     * The compiled pattern is stored in the cache on first use and reused by
     * later calls with the same pattern text.
     *
     * @param regex The pattern text.
     * @param text The text to search in.
     *
     * @return The result of std::regex_search(text, compiled pattern).
     *
     * @throws std::regex_error if the pattern cannot be compiled - compilation
     * is not guarded here, so check untrusted patterns with is_regex() first.
     */
    bool search(std::string const & regex, std::string const & text) {
      // try_emplace only constructs the std::regex when the key is missing.
      auto const & compiled = cache_.try_emplace(regex, regex).first->second;
      return std::regex_search(text, compiled);
    }
  };
  45. }
  46. #if JVALIDATE_HAS_ICU
  47. namespace jvalidate {
  48. /**
  49. * @brief An implementation of a regular expression "engine", for use with
  50. * constraints like "pattern" and "patternProperties".
  51. * Uses the "International Components for Unicode" (icu4c) library for its
  52. * underlying implementation.
  53. *
  54. * These regular expressions operate on the level of graphemes, rather than
  55. * characters. This means that multi-byte characters like emojis will be
  56. * treated as singular characters for the purpose of "character sets" and
  57. * repetition operators.
  58. *
  59. * This regex engine is not ECMA-262 compliant, which means that certain cases
  60. * will not be recognized. This is a notice rather than a true issue, since
  61. * many other languages' regex libraries (e.g. Python) are also not ECMA-262
  62. * compliant.
  63. *
  64. * This means that we pass test cases that ECMAScript rejects, such as:
  65. * - i18n digit characters are captured by \\d
  66. * - i18n characters can be matched by \\w (if they are i18nword chars)
  67. */
  68. class ICURegexEngine {
  69. private:
  70. std::unordered_map<std::string, std::unique_ptr<icu::RegexPattern>> cache_;
  71. public:
  72. static std::string_view engine_name() { return "icu::RegexPattern"; }
  73. static bool is_regex(std::string_view regex) {
  74. icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));
  75. UErrorCode status = U_ZERO_ERROR;
  76. UParseError pe;
  77. std::unique_ptr<icu::RegexPattern> tmp(icu::RegexPattern::compile(ucs, pe, status));
  78. return not U_FAILURE(status);
  79. }
  80. bool search(std::string const & regex, std::string const & text) {
  81. auto [it, created] = cache_.try_emplace(regex, nullptr);
  82. if (created) {
  83. icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));
  84. UErrorCode status = U_ZERO_ERROR;
  85. UParseError pe;
  86. it->second.reset(icu::RegexPattern::compile(ucs, pe, status));
  87. if (U_FAILURE(status)) {
  88. // TODO: Provide info?
  89. return false;
  90. }
  91. }
  92. UErrorCode status = U_ZERO_ERROR;
  93. icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(text));
  94. std::unique_ptr<icu::RegexMatcher> matcher(it->second->matcher(ucs, status));
  95. if (U_FAILURE(status)) {
  96. return false;
  97. }
  98. return matcher->find(status);
  99. }
  100. };
  101. }
  102. #endif