소스 검색

refactor: switch utf8/utf32 conversions to use idna over icu

Sam Jaffe 7 달 전
부모
커밋
7fe69c0e39
4개의 변경된 파일, 53개의 추가 그리고 47개의 삭제
  1. 9 0
      include/jvalidate/_config.h
  2. 39 42
      include/jvalidate/detail/string.h
  3. 3 3
      include/jvalidate/format.h
  4. 2 2
      include/jvalidate/validation_visitor.h

+ 9 - 0
include/jvalidate/_config.h

@@ -0,0 +1,9 @@
+#pragma once
+
+#if __has_include(<unicode/std_string.h>)
+#define JVALIDATE_HAS_ICU
+#endif
+
+#if __has_include(<ada/idna/to_unicode.h>)
+#define JVALIDATE_HAS_IDNA
+#endif

+ 39 - 42
include/jvalidate/detail/string.h

@@ -3,68 +3,65 @@
  * std::string/std::regex is not well suited for UTF8 comprehensions.
  */
 #pragma once
+#include <jvalidate/_config.h>
 
-#include <ostream>
+#include <memory>
 #include <string>
 #include <string_view>
-#if __has_include(<unicode/std_string.h>)
-#define JVALIDATE_HAS_ICU
-#include <unicode/brkiter.h>
-#include <unicode/unistr.h>
+
+#ifdef JVALIDATE_HAS_IDNA
+#include <ada/idna/unicode_transcoding.h>
 #endif
 
 #include <jvalidate/detail/expect.h>
 
-#ifdef JVALIDATE_HAS_ICU
 namespace jvalidate::detail {
+inline size_t length_u8(std::string_view arg) { return arg.length(); }
+inline size_t length_u32(std::u32string_view arg) { return arg.length(); }
+
+inline std::string_view to_u8(std::string_view arg) { return arg; }
+inline std::u32string_view to_u32(std::u32string_view arg) { return arg; }
+}
+
+#ifdef JVALIDATE_HAS_IDNA
+namespace jvalidate::detail {
+/**
+ * @brief Calculates the string-length of the argument, treating multi-byte
+ * characters as their individual bytes (as if the string was a std::string).
+ *
+ * @param arg A string encoded in UTF32
+ *
+ * @returns A number no greater than 4 * arg.length(), depending on the number
+ * of graphemes/codepoints in the string.
+ */
+inline size_t length_u8(std::u32string_view arg) {
+  return ada::idna::utf8_length_from_utf32(arg.data(), arg.length());
+}
+
 /**
  * @brief Calculates the string-length of the argument, treating multi-byte
- * characters an unicode graphemes as single characters (which std::string
+ * characters and unicode graphemes as single characters (which std::string
  * cannot do).
  *
  * @param arg Any UTF8 compatible string (including a standard ASCII string)
  *
- * @returns A number no greater than arg.size(), depending on the number of
+ * @returns A number no greater than arg.length(), depending on the number of
  * graphemes/codepoints in the string.
  */
-inline size_t length(std::string_view arg) {
-  icu::UnicodeString ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
-  return ucs.countChar32();
-}
-
-inline std::string_view to_u8(std::string_view arg) { return arg; }
-
-inline std::string to_u8(std::u32string_view arg) {
-  icu::UnicodeString const ucs =
-      icu::UnicodeString::fromUTF32(reinterpret_cast<int const *>(arg.data()), arg.size());
-  std::string out;
-  return ucs.toUTF8String(out);
+inline size_t length_u32(std::string_view arg) {
+  return ada::idna::utf32_length_from_utf8(arg.data(), arg.length());
 }
 
-inline std::u32string_view to_u32(std::u32string_view arg) { return arg; }
-
-inline std::u32string to_u32(std::string_view arg) {
-  icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
-
-  std::u32string rval;
-  size_t const capacity = ucs.countChar32();
-  rval.resize(capacity);
-
-  UErrorCode status = U_ZERO_ERROR;
-  ucs.toUTF32(reinterpret_cast<int *>(rval.data()), capacity, status);
-  // This should never occur - unless there's like an alloc error
-  if (U_FAILURE(status)) {
-    JVALIDATE_THROW(std::runtime_error, "UTF-32 Translation Error");
-  }
-
-  return rval;
-}
+inline std::string to_u8(std::u32string_view str) {
+  auto data = std::make_unique_for_overwrite<char[]>(4 * str.length());
+  size_t bytes = ada::idna::utf32_to_utf8(str.data(), str.length(), data.get());
+  return std::string(data.get(), data.get() + bytes);
 }
 
-namespace std {
-inline std::ostream & operator<<(std::ostream & os, std::u32string_view str) {
-  return os << jvalidate::detail::to_u8(str);
+inline std::u32string to_u32(std::string_view str) {
+  auto data = std::make_unique_for_overwrite<char32_t[]>(str.length());
+  size_t bytes = ada::idna::utf8_to_utf32(str.data(), str.length(), data.get());
+  return std::u32string(data.get(), data.get() + bytes);
 }
 }
-#else
 #endif

+ 3 - 3
include/jvalidate/format.h

@@ -1,4 +1,5 @@
 #pragma once
+#include <jvalidate/_config.h>
 
 #include <cctype>
 #include <chrono>
@@ -10,8 +11,7 @@
 #include <unordered_map>
 #include <utility>
 
-#if __has_include(<ada/idna/to_unicode.h>)
-#define JVALIDATE_HAS_IDNA
+#ifdef JVALIDATE_HAS_IDNA
 #include <ada/idna/to_unicode.h>
 #include <ada/idna/validity.h>
 #endif
@@ -179,7 +179,7 @@ template <typename CharT> bool is_invalid_host_char(CharT c) {
 template <typename CharT>
 bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
   using delim = detail::char_delimiters<CharT>;
-  return (name.empty() || detail::to_u8(name).size() >= 64 ||
+  return (name.empty() || detail::length_u8(name) >= 64 ||
           (name.size() >= 4 && name.substr(2).starts_with(delim::illegal_dashes_ulabel)) ||
           name[0] == '-' || name.back() == '-');
 }

+ 2 - 2
include/jvalidate/validation_visitor.h

@@ -246,7 +246,7 @@ public:
   Status visit(constraint::MaxLengthConstraint const & cons, Adapter auto const & document) const {
     NOOP_UNLESS_TYPE(String);
     std::string const str = document.as_string();
-    if (int64_t len = detail::length(str); len > cons.value) {
+    if (int64_t len = detail::length_u32(str); len > cons.value) {
       return result(Status::Reject, "string of length ", len, " is >", cons.value);
     } else {
       return result(Status::Accept, "string of length ", len, " is <=", cons.value);
@@ -256,7 +256,7 @@ public:
   Status visit(constraint::MinLengthConstraint const & cons, Adapter auto const & document) const {
     NOOP_UNLESS_TYPE(String);
     std::string const str = document.as_string();
-    if (int64_t len = detail::length(str); len < cons.value) {
+    if (int64_t len = detail::length_u32(str); len < cons.value) {
       return result(Status::Reject, "string of length ", len, " is <", cons.value);
     } else {
       return result(Status::Accept, "string of length ", len, " is >=", cons.value);