Sfoglia il codice sorgente

feat: add support for ICU Regex, remove regex fuzzing nonsense for tricking std::regex into working w/ emojis

Sam Jaffe 2 settimane fa
parent
commit
76e93c5ba1

+ 1 - 1
Makefile

@@ -13,7 +13,7 @@ CXX_FLAGS := -Wall -Wextra -Werror -std=c++20 \
 	     -isystem include/ -I/opt/homebrew/opt/icu4c/include \
 	     -DJVALIDATE_USE_EXCEPTIONS -DJVALIDATE_LOAD_FAILURE_AS_FALSE_SCHEMA
 
-LD_FLAGS := -L/opt/homebrew/lib -L/opt/homebrew/opt/icu4c/lib -licuuc
+LD_FLAGS := -L/opt/homebrew/lib -L/opt/homebrew/opt/icu4c/lib -licuuc -licui18n
 
 TEST_DIR := tests/
 INCLUDE_DIR := include/

+ 7 - 0
include/jvalidate/_config.h

@@ -0,0 +1,7 @@
+#pragma once
+
+#if __has_include(<unicode/std_string.h>)
+#define JVALIDATE_HAS_ICU 1
+#else
+#define JVALIDATE_HAS_ICU 0
+#endif

+ 10 - 0
include/jvalidate/_macro.h

@@ -0,0 +1,10 @@
+#pragma once
+
+#include <jvalidate/_config.h>
+
+#define JVALIDATE_CONCAT2(A, B) A##B
+#define JVALIDATE_CONCAT(A, B) JVALIDATE_CONCAT2(A, B)
+
+#define JVALIDATE_IIF0(IF, ELSE) ELSE
+#define JVALIDATE_IIF1(IF, ELSE) IF
+#define JVALIDATE_IIF(CONDITIONAL, IF, ELSE) JVALIDATE_CONCAT(JVALIDATE_IIF, CONDITIONAL)(IF, ELSE)

+ 6 - 2
include/jvalidate/detail/pointer.h

@@ -3,13 +3,14 @@
 #include <algorithm>
 #include <cassert>
 #include <iostream>
-#include <jvalidate/detail/number.h>
 #include <string>
 #include <string_view>
 #include <variant>
 #include <vector>
 
 #include <jvalidate/compat/compare.h>
+#include <jvalidate/detail/expect.h>
+#include <jvalidate/detail/number.h>
 #include <jvalidate/forward.h>
 
 namespace jvalidate::detail {
@@ -63,8 +64,9 @@ public:
         // than '/' and '~' to be handled in all contexts.
         // TODO(samjaffe): Only do this if enc is hex-like (currently throws?)
         if (in[i] == '%') {
-          char const enc[3] = {in[i + 1], in[i + 2], '\0'};
+          std::string_view enc = std::string_view(in).substr(i + 1, 2);
           in.replace(i, 3, 1, from_str<char>(enc, 16));
+          continue;
         } else if (in[i] != '~') {
           // Not a special char-sequence, does not need massaging
           continue;
@@ -77,6 +79,8 @@ public:
           in.replace(i, 2, 1, '~');
         } else if (in[i + 1] == '1') {
           in.replace(i, 2, 1, '/');
+        } else {
+          JVALIDATE_THROW(std::runtime_error, "Illegal ~ code");
         }
       }
       tokens_.push_back(std::move(in));

+ 1 - 2
include/jvalidate/detail/scoped_state.h

@@ -3,8 +3,7 @@
 #include <functional>
 #include <type_traits>
 
-#define JVALIDATE_CONCAT2(A, B) A##B
-#define JVALIDATE_CONCAT(A, B) JVALIDATE_CONCAT2(A, B)
+#include <jvalidate/_macro.h>
 
 /**
  * @breif Create an anonymous scoped state object, which represents a temporary

+ 9 - 69
include/jvalidate/detail/string.h

@@ -3,93 +3,33 @@
  * std::string/std::regex is not well suited for UTF8 comprehensions.
  */
 #pragma once
+#include <jvalidate/_config.h>
 
-#if __has_include(<unicode/std_string.h>)
-#define JVALIDATE_HAS_ICU
+#include <cstring>
+
+#if JVALIDATE_HAS_ICU
 #include <unicode/brkiter.h>
 #include <unicode/unistr.h>
 #endif
-#include <cstring>
 
 namespace jvalidate::detail {
 /**
  * @brief Calclates the string-length of the argument, treating multi-byte
- * characters an unicode graphemes as single characters (which std::string
+ * characters and unicode graphemes as single characters (which std::string
  * cannot do).
  *
  * @param arg Any UTF8 compatible string (including a standard ASCII string)
  *
- * @returns A number no greater than arg.size(), depending on the number of
+ * @returns A number no greater than arg.length(), depending on the number of
  * graphemes/codepoints in the string.
  */
+#if JVALIDATE_HAS_ICU
 inline size_t length(std::string_view arg) {
-#ifdef JVALIDATE_HAS_ICU
   icu::UnicodeString ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
   return ucs.countChar32();
-#else
-  return arg.size();
-#endif
 }
-
-/**
- * @brief Ensures that any codepoints/graphemes in the given regular expression
- * are wrapped in parenthesis in order to ensure that e.g. <PIRATE-EMOJI>*
- * properly matches the entire emoji multiple times, instead of just the last
- * byte of the string.
- *
- * Because we are only performing a regex search, and not matching/capturing
- * groups - we don't care that all of these extra parenthesis cause us to
- * generate new capture-groups or push some of the groups to a later point.
- *
- * @param arg A regular expression string, to be sanitized for UTF8 pattern-
- * matching.
- *
- * @returns The regular expression, with some more parenthesis added.
- */
-inline std::string regex_escape(std::string_view arg) {
-#ifdef JVALIDATE_HAS_ICU
-  icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
-  // Short-circuit if there are no multi-byte codepoints or graphemes, since
-  // C++ regexes don't have any problems with those.
-  if (ucs.countChar32() == arg.size()) {
-    return std::string(arg);
-  }
-
-  UErrorCode status = U_ZERO_ERROR;
-  // createCharacterInstance directly uses new - without any special allocation
-  // rules or cleanup, since the first argument is NULL.
-  std::unique_ptr<icu::BreakIterator> iter(
-      icu::BreakIterator::createCharacterInstance(NULL, status));
-
-  // This should never occur - unless there's like an alloc error
-  if (U_FAILURE(status)) {
-    return std::string(arg);
-  }
-
-  icu::UnicodeString rval;
-  iter->setText(ucs);
-  int32_t start = iter->first();
-  int32_t end = iter->next();
-  while (end != icu::BreakIterator::DONE) {
-    // 0-or-1, 1-or-more, 0-or-more markings
-    // This could be optimized to only operate when on a multibyte character
-    if (std::strchr("?*+", ucs.charAt(end))) {
-      rval.append('(');
-      rval.append(ucs, start, end - start);
-      rval.append(')');
-      rval.append(ucs.char32At(end));
-      end = iter->next();
-    } else {
-      rval.append(ucs, start, end - start);
-    }
-    start = end;
-    end = iter->next();
-  }
-
-  std::string out;
-  return rval.toUTF8String(out);
 #else
-  return std::string(arg);
+
+inline size_t length(std::string_view arg) { return arg.length(); }
 #endif
 }
-}

+ 3 - 3
include/jvalidate/forward.h

@@ -172,9 +172,9 @@ concept MutableAdapter = Adapter<A> && requires(A const a) {
 };
 
 template <typename R>
-concept RegexEngine = requires(R & regex) {
-  { regex.engine_name() } -> std::convertible_to<std::string_view>;
-  { regex.search("" /* pattern */, "" /* text */) } -> std::same_as<bool>;
+concept RegexEngine = requires(R & engine) {
+  { R::engine_name() } -> std::convertible_to<std::string_view>;
+  { engine.search("" /* pattern */, "" /* text */) } -> std::same_as<bool>;
 };
 
 template <typename E, typename A, typename B, typename V>

+ 93 - 0
include/jvalidate/regex.h

@@ -0,0 +1,93 @@
+#pragma once
+
+#include <regex>
+#include <unordered_map>
+
+#include <jvalidate/_macro.h>
+
+#if JVALIDATE_HAS_ICU
+#include <unicode/regex.h>
+#include <unicode/ustring.h>
+#include <unicode/utypes.h>
+#endif
+
+namespace jvalidate {
+/**
+ * @brief An implementation of a regular expression "engine", for use with
+ * constraints like "pattern" and "patternProperties".
+ * Uses std::regex as its underlying implementation.
+ *
+ * While being std::regex means that it is the most sensible choice for a
+ * default RegexEngine, the performance of std::regex is generally the worst
+ * among C++ regex utilities, and it struggles to compile several patterns.
+ * See https://stackoverflow.com/questions/70583395/ for an explanation.
+ *
+ * If you need to use complicated patterns in your json schema, provide a
+ * RegexEngine compatible wrapper for a different library, such as re2.
+ */
+class StdRegexEngine {
+private:
+  // Compiled expressions, keyed by the pattern text they were built from, so
+  // that each distinct pattern is only compiled once per engine instance.
+  std::unordered_map<std::string, std::regex> cache_;
+
+public:
+  /**
+   * @brief A human-readable identifier for this engine.
+   */
+  static std::string_view engine_name() { return "std::regex[ECMAScript]"; }
+
+  /**
+   * @brief Tests if a string can be compiled as a regular expression by this
+   * engine, without caching the result.
+   *
+   * @param regex The candidate pattern
+   *
+   * @returns true if constructing a std::regex from the pattern succeeds,
+   * false if construction throws any std::exception.
+   */
+  static bool is_regex(std::string_view regex) {
+    try {
+      [[maybe_unused]] std::regex _{std::string(regex)};
+      return true;
+    } catch (std::exception const &) { return false; }
+  }
+
+  /**
+   * @brief Tests if any part of text matches the given pattern.
+   *
+   * @param regex The pattern to search with. It is compiled on first use and
+   * the compiled form is reused on later calls with the same pattern.
+   * @param text The string to search within
+   *
+   * @returns The result of std::regex_search(text, compiled-pattern).
+   *
+   * Nothing is caught here, so an exception thrown while compiling the
+   * pattern propagates to the caller.
+   */
+  bool search(std::string const & regex, std::string const & text) {
+    // try_emplace only constructs the std::regex when the key is not yet
+    // present in the cache.
+    auto const & re = cache_.try_emplace(regex, regex).first->second;
+    return std::regex_search(text, re);
+  }
+};
+}
+
+#if JVALIDATE_HAS_ICU
+namespace jvalidate {
+/**
+ * @brief An implementation of a regular expression "engine", for use with
+ * constraints like "pattern" and "patternProperties".
+ * Uses icu::RegexPattern as its underlying implementation.
+ *
+ * Both the pattern and the text being searched are converted from UTF-8 into
+ * icu::UnicodeString before compiling/matching.
+ */
+class ICURegexEngine {
+private:
+  // Compiled patterns, keyed by the pattern text they were built from.
+  // A null value records a pattern that failed to compile, so that it is
+  // neither recompiled nor dereferenced by later searches.
+  std::unordered_map<std::string, std::unique_ptr<icu::RegexPattern>> cache_;
+
+public:
+  /**
+   * @brief A human-readable identifier for this engine.
+   */
+  static std::string_view engine_name() { return "icu::RegexPattern"; }
+
+  /**
+   * @brief Tests if a string can be compiled as a regular expression by this
+   * engine, without caching the result.
+   *
+   * @param regex The candidate pattern, as UTF-8
+   *
+   * @returns true if icu::RegexPattern::compile reports no failure.
+   */
+  static bool is_regex(std::string_view regex) {
+    icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));
+
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    // Owned only so that a successfully compiled pattern is released again.
+    std::unique_ptr<icu::RegexPattern> tmp(icu::RegexPattern::compile(ucs, pe, status));
+
+    return not U_FAILURE(status);
+  }
+
+  /**
+   * @brief Tests if any part of text matches the given pattern.
+   *
+   * @param regex The pattern to search with, as UTF-8. It is compiled on first
+   * use and the compiled form is reused on later calls with the same pattern.
+   * @param text The string to search within, as UTF-8
+   *
+   * @returns true if a match is found. false if there is no match, if the
+   * pattern cannot be compiled (on this call or any earlier one), or if ICU
+   * fails to create the matcher.
+   */
+  bool search(std::string const & regex, std::string const & text) {
+    auto [it, created] = cache_.try_emplace(regex, nullptr);
+    if (created) {
+      icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(regex));
+
+      UErrorCode status = U_ZERO_ERROR;
+      UParseError pe;
+      it->second.reset(icu::RegexPattern::compile(ucs, pe, status));
+
+      if (U_FAILURE(status)) {
+        // TODO: Provide info?
+        // Keep the key, but guarantee the cached value is null so that the
+        // check below rejects this pattern on every call.
+        it->second.reset();
+      }
+    }
+
+    // A null entry means the pattern is known to be invalid. Without this
+    // check, a repeated search with a bad pattern would skip the compile step
+    // above and dereference a null pointer.
+    icu::RegexPattern const * const pattern = it->second.get();
+    if (pattern == nullptr) {
+      return false;
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    // Declared before the matcher so that it outlives it: per the ICU
+    // documentation the matcher keeps a reference to its input string.
+    icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(text));
+    std::unique_ptr<icu::RegexMatcher> matcher(pattern->matcher(ucs, status));
+    if (U_FAILURE(status) || matcher == nullptr) {
+      return false;
+    }
+    return matcher->find(status);
+  }
+};
+}
+#endif

+ 2 - 29
include/jvalidate/validator.h

@@ -1,40 +1,13 @@
 #pragma once
 
-#include <regex>
-#include <unordered_map>
-
 #include <jvalidate/detail/on_block_exit.h>
 #include <jvalidate/forward.h>
+#include <jvalidate/regex.h>
 #include <jvalidate/status.h>
 #include <jvalidate/validation_config.h>
 #include <jvalidate/validation_visitor.h>
 
 namespace jvalidate::detail {
-/**
- * @brief An implementation of a regular expression "engine", for use with
- * constraints like "pattern" and "patternProperties".
- * Uses std::regex as its underlying implementation.
- *
- * While being std::regex means that it is the most sensible choice for a
- * default RegexEngine, the performance of std::regex is generally the worst
- * among C++ regex utilities, and it struggles to compile several patterns.
- * See https://stackoverflow.com/questions/70583395/ for an explaination.
- *
- * If you need to use complicated patterns in your json schema, provide a
- * RegexEngine compatible wrapper for a different library, such as re2.
- */
-class StdRegexEngine {
-private:
-  std::unordered_map<std::string, std::regex> cache_;
-
-public:
-  static std::string_view engine_name() { return "std::regex[ECMAScript]"; }
-  bool search(std::string const & regex, std::string const & text) {
-    auto const & re = cache_.try_emplace(regex, regex).first->second;
-    return std::regex_search(text, re);
-  }
-};
-
 /**
  * @brief An implementation of an "Extension Constraint Visitor" plugin that
  * does nothing.
@@ -49,7 +22,7 @@ namespace jvalidate {
  *
  * @tparam RE A type that can be used to solve regular expressions
  */
-template <RegexEngine RE = detail::StdRegexEngine,
+template <RegexEngine RE = JVALIDATE_IIF(JVALIDATE_HAS_ICU, ICURegexEngine, StdRegexEngine),
           typename ExtensionVisitor = detail::StubExtensionVisitor>
 class Validator {
 private: