Explorar o código

feat: implement hostname/idn-hostname

Sam Jaffe hai 7 meses
pai
achega
58959a21ee

+ 3 - 0
.gitmodules

@@ -1,3 +1,6 @@
 [submodule "thirdparty/JSON-Schema-Test-Suite"]
 	path = thirdparty/JSON-Schema-Test-Suite
 	url = git@github.com:json-schema-org/JSON-Schema-Test-Suite.git
+[submodule "thirdparty/idna"]
+	path = thirdparty/idna
+	url = https://github.com/ada-url/idna.git

+ 1 - 1
Makefile

@@ -13,7 +13,7 @@ CXX_FLAGS := -Wall -Wextra -Werror -std=c++20 \
 	     -isystem include/ -I/opt/homebrew/opt/icu4c/include \
 	     -DJVALIDATE_USE_EXCEPTIONS -DJVALIDATE_LOAD_FAILURE_AS_FALSE_SCHEMA
 
-LD_FLAGS := -L/opt/homebrew/lib -L/opt/homebrew/opt/icu4c/lib -licuuc
+LD_FLAGS := -L/opt/homebrew/lib -L/opt/homebrew/opt/icu4c/lib -licuuc -lada-idna
 
 TEST_DIR := tests/
 INCLUDE_DIR := include/

+ 123 - 0
include/jvalidate/detail/idna_special_cases.h

@@ -0,0 +1,123 @@
+#pragma once
+#include <string_view>
+
+#include <jvalidate/detail/string.h>
+#include <jvalidate/forward.h>
+
+namespace jvalidate::format::detail {
+/**
+ * @brief A contextual rule: a set of trigger characters plus a predicate that
+ * must hold at every position where one of those characters occurs.
+ *
+ * @tparam CharT Character type of the strings being inspected.
+ */
+template <typename CharT> struct special_case {
+  /// Characters that trigger the rule (matched with find_first_of).
+  std::basic_string_view<CharT> target;
+  /// Predicate called with the whole string and the index of a trigger.
+  bool (*accepts_at)(std::basic_string_view<CharT>, size_t);
+
+  /**
+   * @brief Check every occurrence of a trigger character in str.
+   * @return false as soon as accepts_at rejects an occurrence, true if all
+   * occurrences pass or there are none.
+   */
+  bool accepts(std::basic_string_view<CharT> const str) const {
+    size_t pos = str.find_first_of(target);
+    while (pos != str.npos) {
+      if (not accepts_at(str, pos)) {
+        return false;
+      }
+      pos = str.find_first_of(target, pos + 1);
+    }
+    return true;
+  }
+};
+}
+
+namespace jvalidate::format::detail {
+// Code points that receive exception handling during hostname validation
+// (see https://datatracker.ietf.org/doc/html/rfc5892#section-2.6).
+// Declared `inline` so that every translation unit including this header
+// refers to the same object; the inline helpers below odr-use these views.
+inline constexpr std::u32string_view g_exception_chars =
+    U"\u00B7\u00DF\u0375\u03C2\u05F3\u05F4\u0640\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667"
+    U"\u0668\u0669\u06F0\u06F1\u06F2\u06F3\u06F4\u06F5\u06F6\u06F7\u06F8\u06F9\u06FD\u06FE\u07FA"
+    U"\u0F0B\u3007\u302E\u302F\u3031\u3032\u3033\u3034\u3035\u303B\u30FB";
+// Subset of g_exception_chars that is_not_disallowed_exception() reports as
+// disallowed.
+inline constexpr std::u32string_view g_exception_disallowed_chars =
+    U"\u0640\u07FA\u302E\u302F\u3031\u3032\u3033\u3034\u3035\u303B";
+
+/**
+ * @brief Determine if the character is NOT one of the code points marked
+ * DISALLOWED by special case in the Exceptions
+ * (https://datatracker.ietf.org/doc/html/rfc5892#section-2.6) table.
+ *
+ * @return true when c is absent from g_exception_disallowed_chars, false when
+ * it is present.
+ */
+inline bool is_not_disallowed_exception(char32_t c) {
+  return g_exception_disallowed_chars.find(c) == std::u32string_view::npos;
+}
+
+/**
+ * @brief Report whether c lies in the "Greek" range U+0370..U+03FF
+ * (both ends inclusive).
+ */
+inline bool is_greek(char32_t c) {
+  constexpr char32_t first = U'\u0370';
+  constexpr char32_t last = U'\u03FF';
+  return first <= c && c <= last;
+}
+
+/**
+ * @brief Report whether c lies in the "Hebrew" range U+0590..U+05FF
+ * (both ends inclusive).
+ */
+inline bool is_hebrew(char32_t c) {
+  constexpr char32_t first = U'\u0590';
+  constexpr char32_t last = U'\u05FF';
+  return first <= c && c <= last;
+}
+
+/**
+ * @brief Determine if the character is in the "Han" (Kanji), "Hiragana", or
+ * "Katakana" character ranges, excepting "KATAKANA MIDDLE DOT".
+ *
+ * U+30FB is excluded by splitting the kana range into U+3040..U+30FA and
+ * U+30FC..U+30FF.
+ *
+ * @return true if c falls inside any of the inclusive ranges in the table.
+ */
+inline bool is_jp(char32_t c) {
+  using P = std::pair<char32_t, char32_t>;
+  constexpr std::array range{P{U'\u3040', U'\u30FA'},         P{U'\u30FC', U'\u30FF'},
+                             P{U'\u4E00', U'\u9FFF'},         P{U'\u3400', U'\u4DBF'},
+                             P{U'\U00020000', U'\U0002A6DF'}, P{U'\U0002A700', U'\U0002EBEF'},
+                             P{U'\U00030000', U'\U000323AF'}, P{U'\U0002EBF0', U'\U0002EE5F'},
+                             P{U'\U000323B0', U'\U0003347F'}, P{U'\uF900', U'\uFAFF'},
+                             P{U'\u2E80', U'\u303F'},         P{U'\u31C0', U'\u31EF'}};
+  return std::ranges::any_of(range, [c](P p) { return c >= p.first && c <= p.second; });
+}
+
+/**
+ * @brief Report whether c is the ASCII lowercase letter 'l'; used by the
+ * "MIDDLE DOT" rule.
+ */
+inline bool is_l_char(char32_t c) {
+  constexpr char32_t lowercase_l = U'l';
+  return c == lowercase_l;
+}
+}
+
+namespace jvalidate::format::detail {
+/**
+ * @brief Test whether the character immediately before index n satisfies F.
+ * @return false when n is 0 (there is no preceding character).
+ */
+template <auto F> constexpr auto char_before(std::u32string_view str, size_t n) {
+  if (n == 0) {
+    return false;
+  }
+  return static_cast<bool>(F(str[n - 1]));
+}
+
+/**
+ * @brief Test whether the character immediately after index n satisfies F.
+ * @return false when n is the last index, when str is empty, or when n is not
+ * a valid index into str.
+ */
+template <auto F> constexpr auto char_after(std::u32string_view str, size_t n) {
+  // Avoid `str.size() - 1`: it wraps around for an empty view and would let an
+  // out-of-range n index past the end.
+  return n < str.size() && n + 1 < str.size() && F(str[n + 1]);
+}
+
+/**
+ * @brief Test whether at least one neighbour of index n (the character before
+ * or the character after) satisfies F.
+ */
+template <auto F> constexpr auto before_or_after(std::u32string_view str, size_t n) {
+  if (char_before<F>(str, n)) {
+    return true;
+  }
+  return static_cast<bool>(char_after<F>(str, n));
+}
+
+/**
+ * @brief Test whether both neighbours of index n (the character before and
+ * the character after) satisfy F.
+ */
+template <auto F> constexpr auto before_and_after(std::u32string_view str, size_t n) {
+  if (not char_before<F>(str, n)) {
+    return false;
+  }
+  return static_cast<bool>(char_after<F>(str, n));
+}
+
+/**
+ * @brief Test whether any character of str other than the one at index n
+ * satisfies F.
+ */
+template <auto F> constexpr auto any_other_char(std::u32string_view str, size_t n) {
+  auto const self = str.begin() + n;
+  // Scan everything before index n first, then everything after it.
+  if (std::any_of(str.begin(), self, F)) {
+    return true;
+  }
+  return std::any_of(self + 1, str.end(), F);
+}
+}
+
+namespace jvalidate::format::detail {
+using std::string_view_literals::operator""sv;
+template <typename CharT> struct char_delimiters;
+
+/**
+ * @brief Delimiter constants for hostnames held in plain `char` strings.
+ *
+ * There are no illegal characters and no contextual rules for this
+ * specialization, so both collections are explicitly empty. The explicit `{}`
+ * initializers keep the declarations valid on compilers that require an
+ * initializer on constexpr static data members.
+ */
+template <> struct char_delimiters<char> {
+  static constexpr std::string_view dotdot{".."};
+  static constexpr std::string_view illegal_hostname_chars{};
+  static constexpr std::string_view punycode_prefix{"xn--"};
+  static constexpr std::string_view illegal_dashes_ulabel{"--"};
+  static constexpr std::array<special_case<char>, 0> special_cases{};
+};
+
+template <> struct char_delimiters<char32_t> { // delimiter constants and contextual rules for UTF-32 hostnames
+  static constexpr std::u32string_view dotdot{U".."};
+  static constexpr std::u32string_view punycode_prefix{U"xn--"};
+  static constexpr std::u32string_view illegal_dashes_ulabel{U"--"};
+  static constexpr std::u32string_view illegal_hostname_chars{U"\u302E"}; // hostname() rejects any name containing one of these
+  static constexpr std::array special_cases{
+      special_case{U"\u0375"sv, char_after<is_greek>}, // U+0375 must be directly followed by an is_greek character
+      special_case{U"\u05f3"sv, char_before<is_hebrew>}, // U+05F3 must be directly preceded by an is_hebrew character
+      special_case{U"\u05f4"sv, char_before<is_hebrew>}, // U+05F4 must be directly preceded by an is_hebrew character
+      special_case{U"\u00b7"sv, before_and_after<is_l_char>}, // MIDDLE DOT must sit between two 'l' characters
+      special_case{U"\u30fb"sv, any_other_char<is_jp>}, // KATAKANA MIDDLE DOT needs some other is_jp character in the string
+      special_case{g_exception_chars, before_or_after<is_not_disallowed_exception>}, // NOTE(review): an exception char with no neighbour (single-char name) is rejected - confirm intended
+  };
+};
+}

+ 19 - 0
include/jvalidate/detail/string.h

@@ -4,7 +4,9 @@
  */
 #pragma once
 
+#include <ostream>
 #include <string>
+#include <string_view>
 #if __has_include(<unicode/std_string.h>)
 #define JVALIDATE_HAS_ICU
 #include <unicode/brkiter.h>
@@ -88,6 +90,17 @@ inline std::string regex_escape(std::string_view arg) {
   return rval.toUTF8String(out);
 }
 
+inline std::string_view to_u8(std::string_view arg) { return arg; } // pass-through overload: the view is returned unchanged
+
+/**
+ * @brief Transcode a UTF-32 string to a UTF-8 std::string via ICU.
+ * @param arg The UTF-32 code points to convert.
+ * @return A new string holding the UTF-8 form produced by
+ * icu::UnicodeString::toUTF8String.
+ */
+inline std::string to_u8(std::u32string_view arg) {
+  auto const * const code_points = reinterpret_cast<int const *>(arg.data());
+  std::string out;
+  icu::UnicodeString::fromUTF32(code_points, arg.size()).toUTF8String(out);
+  return out;
+}
+
+inline std::u32string_view to_u32(std::u32string_view arg) { return arg; } // pass-through overload: the view is returned unchanged
+
 inline std::u32string to_u32(std::string_view arg) {
   icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
 
@@ -105,5 +118,11 @@ inline std::u32string to_u32(std::string_view arg) {
   return rval;
 }
 }
+
+namespace std { // NOTE(review): adding overloads to namespace std is undefined behaviour ([namespace.std]); presumably placed here so lookup finds it for std::u32string_view - confirm before moving
+inline std::ostream & operator<<(std::ostream & os, std::u32string_view str) {
+  return os << jvalidate::detail::to_u8(str); // streams the UTF-8 transcoding of str
+}
+}
 #else
 #endif

+ 66 - 23
include/jvalidate/format.h

@@ -9,6 +9,13 @@
 #include <unordered_map>
 #include <utility>
 
+#if __has_include(<ada/idna/to_unicode.h>)
+#define JVALIDATE_HAS_IDNA
+#include <ada/idna/to_unicode.h>
+#include <ada/idna/validity.h>
+#endif
+
+#include <jvalidate/detail/idna_special_cases.h>
 #include <jvalidate/detail/pointer.h>
 #include <jvalidate/detail/relative_pointer.h>
 #include <jvalidate/detail/string.h>
@@ -18,8 +25,6 @@
 #define UTF32(FN) format::utf32<format::FN<char32_t>>
 
 namespace jvalidate::format::detail {
-using namespace jvalidate::detail;
-
 struct result {
   ptrdiff_t consumed;
   bool valid;
@@ -151,31 +156,70 @@ inline bool duration(std::string_view dur) {
   return dur.empty();
 }
 
+/**
+ * @brief Report whether c may NOT appear in a hostname label.
+ *
+ * A character is acceptable when it is '-', when it is above 0x7F, or when
+ * std::isalnum accepts it.
+ *
+ * @tparam CharT Character type of the hostname being inspected.
+ * @return true if c is not acceptable.
+ */
+template <typename CharT> bool is_invalid_host_char(CharT c) {
+  // Test the range first: std::isalnum is only defined for values
+  // representable as unsigned char, so wide code points must never reach it
+  // and narrow chars are converted before the call.
+  return c != '-' && not(c > 0x7F || std::isalnum(static_cast<unsigned char>(c)));
+}
+
+/**
+ * @brief Reject a hostname label on length or hyphen-placement grounds.
+ *
+ * @param name A single label (no '.' separators).
+ * @return true if the label is empty, is 64 or more bytes long once converted
+ * with to_u8, has the illegal_dashes_ulabel sequence starting at index 2, or
+ * begins or ends with '-'.
+ */
+template <typename CharT>
+bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
+  using delim = detail::char_delimiters<CharT>;
+  if (name.empty()) {
+    return true;
+  }
+  if (detail::to_u8(name).size() >= 64) {
+    return true;
+  }
+  if (name.size() >= 4 && name.substr(2).starts_with(delim::illegal_dashes_ulabel)) {
+    return true;
+  }
+  return name.front() == '-' || name.back() == '-';
+}
+
+#ifndef JVALIDATE_HAS_IDNA
+/**
+ * @brief Validate one dot-separated hostname label (build without the
+ * ada-idna headers).
+ *
+ * @param name A single label, without '.' separators.
+ * @return true if the label passes is_invalid_size_or_boundary_hostname() and
+ * every character is '-' or accepted by std::isalnum.
+ */
+inline bool hostname_part(std::string_view name) {
+  if (is_invalid_size_or_boundary_hostname(name)) {
+    return false;
+  }
+  // std::isalnum is only defined for values representable as unsigned char.
+  return std::ranges::none_of(name, [](char c) {
+    return c != '-' && not std::isalnum(static_cast<unsigned char>(c));
+  });
+}
+#else
+/**
+ * @brief Validate one dot-separated hostname label (build with ada-idna).
+ *
+ * A label starting with the punycode prefix is decoded with
+ * ada::idna::to_unicode; it is accepted only if decoding changed it and the
+ * decoded label is itself valid. Any other label must pass
+ * is_invalid_size_or_boundary_hostname(), and then either consist solely of
+ * '-' and std::isalnum characters (char) or satisfy
+ * ada::idna::is_label_valid (char32_t).
+ *
+ * @tparam CharT Character type of the label.
+ * @param name A single label, without '.' separators.
+ * @return true if the label is acceptable.
+ */
+template <typename CharT> inline bool hostname_part(std::basic_string_view<CharT> name) {
+  using delim = detail::char_delimiters<CharT>;
+  if (name.starts_with(delim::punycode_prefix)) {
+    // NOTE(review): presumably to_unicode hands back its input when decoding
+    // fails, which is what the inequality check detects - confirm against ada.
+    std::u32string decoded = detail::to_u32(ada::idna::to_unicode(detail::to_u8(name)));
+    return (decoded != detail::to_u32(name)) && hostname_part<char32_t>(decoded);
+  }
+
+  if (is_invalid_size_or_boundary_hostname(name)) {
+    return false;
+  }
+  if constexpr (std::is_same_v<char, CharT>) {
+    // std::isalnum is only defined for values representable as unsigned char.
+    return std::ranges::none_of(name, [](char c) {
+      return c != '-' && not std::isalnum(static_cast<unsigned char>(c));
+    });
+  } else {
+    return ada::idna::is_label_valid(name);
+  }
+}
+#endif
+
 // Limitation - does not inspect graphemes, so it cannot check idn-hostname
 // to fix this - we'd need to
 template <typename CharT = char> inline bool hostname(std::basic_string_view<CharT> name) {
-  auto hostname_part = [&name](size_t end) {
-    if (end == 0 || end >= 64 || name[0] == '-' || name[end - 1] == '-') {
-      return false;
-    }
-    for (size_t i = 0; i < end; ++i) {
-      if (name[i] != '-' && not std::isalnum(name[i])) {
-        return false;
-      }
-    }
-    return true;
-  };
+  using delim = detail::char_delimiters<CharT>;
+  if (name.find_first_of(delim::illegal_hostname_chars) != name.npos) {
+    return false;
+  }
 
-  if (name.size() > (name.back() == '.' ? 254 : 253)) {
+  if (detail::to_u8(name).size() > (name.back() == '.' ? 254 : 253)) {
     return false;
   }
+
+  if (not std::ranges::all_of(delim::special_cases,
+                              [name](auto & sc) { return sc.accepts(name); })) {
+    return false;
+  }
+
   for (size_t n = name.find('.'); n != std::string::npos;
        name.remove_prefix(n + 1), n = name.find('.')) {
-    if (not hostname_part(n)) {
+    if (not hostname_part(name.substr(0, n))) {
       return false;
     }
   }
-  return name.empty() || hostname_part(name.size());
+  return name.empty() || hostname_part(name);
 }
 
 inline bool ipv4(std::string_view ip) {
@@ -190,7 +234,7 @@ inline bool ipv4(std::string_view ip) {
   if (size_t n = ip.find(".0"); n != std::string::npos && std::isdigit(ip[n + 2])) {
     return false;
   }
-  if (sscanf(ip.data(), "%3u.%3u.%3u.%3u%c", &ip0, &ip1, &ip2, &ip3, &eof) != 4) {
+  if (sscanf(std::string(ip).c_str(), "%3u.%3u.%3u.%3u%c", &ip0, &ip1, &ip2, &ip3, &eof) != 4) {
     return false;
   }
   return ip0 <= 0xFF && ip1 <= 0xFF && ip2 <= 0xFF && ip3 <= 0xFF;
@@ -252,6 +296,7 @@ inline bool ipv6(std::string_view ip) {
 // complex grammar - as long as it has an '@' sign with at least one character
 // on each side, we ought to call it an email.
 template <typename CharT = char> inline bool email(std::basic_string_view<CharT> em) {
+  using delim = detail::char_delimiters<CharT>;
   size_t n = em.find_last_of('@');
   if (n == 0 || n >= em.size() - 1) {
     return false;
@@ -262,7 +307,7 @@ template <typename CharT = char> inline bool email(std::basic_string_view<CharT>
     // No validation
   } else if (who.starts_with('.') || who.ends_with('.')) {
     return false;
-  } else if (CharT const dots[3] = {'.', '.', '\0'}; em.substr(0, n).find(dots) != em.npos) {
+  } else if (em.substr(0, n).find(delim::dotdot) != em.npos) {
     return false;
   }
 
@@ -273,13 +318,11 @@ template <typename CharT = char> inline bool email(std::basic_string_view<CharT>
   domain.remove_prefix(1);
   domain.remove_suffix(1);
 
-  std::string ip(domain.size(), '\0'); // Re-acquiring the NULL terminator
-  std::ranges::copy(domain, ip.begin());
-
-  if (ip.starts_with("IPv6:")) {
+  if (auto ip = detail::to_u8(domain); ip.starts_with("IPv6:")) {
     return ipv6(ip.substr(5));
+  } else {
+    return ipv4(ip);
   }
-  return ipv4(ip);
 }
 
 template <typename T> inline bool ctor_as_valid(std::string_view str) {

+ 5 - 0
include/jvalidate/forward.h

@@ -12,6 +12,11 @@
 
 #define COMMA_NAME(X) , X
 
+namespace jvalidate::detail {}
+namespace jvalidate::format::detail {
+using namespace jvalidate::detail;
+}
+
 namespace jvalidate {
 class Schema;
 class Status;

+ 1 - 0
thirdparty/idna

@@ -0,0 +1 @@
+Subproject commit f53b5ec93f40fd6293bf206db0bddde686101f3f