vor 7 Monaten · aa196e3a5d
--- a/Makefile
+++ b/Makefile
@@ -26,8 +26,9 @@ TEST_OBJECTS := $(patsubst %.cxx, .build/%.o, $(TEST_SOURCES))
 
				 TEST_BINARIES := .build/bin/selfvalidate .build/bin/annotation_test .build/bin/extension_test
			
 
				 EXECUTE_TESTS := $(patsubst %, %.done, $(TEST_BINARIES))
			
 
				 
			
 
				-EXCLUDED_FORMAT_TESTS := color idn iri iru ip-address host-name
			
 
				-EXCLUDED_FORMAT_TESTS := $(shell printf ":*optional_format_%s*" $(EXCLUDED_TESTS) | cut -c2-)
			
 
				+EXCLUDED_FORMAT_TESTS := color ip_address host_name
			
 
				+EXCLUDED_FORMAT_TESTS := $(shell printf ":*optional_format_%s*" $(EXCLUDED_FORMAT_TESTS) | cut -c2-)
			
 
				+EXCLUDED_FORMAT_TESTS := Draft3/JsonSchema.TestSuite/optional_format_time:$(EXCLUDED_FORMAT_TESTS)
			
 
				 EXCLUDED_TESTS := content ecmascript_regex zeroTerminatedFloats non_bmp_regex
			
 
				 EXCLUDED_TESTS := $(shell printf ":*optional_%s" $(EXCLUDED_TESTS) | cut -c2-):$(EXCLUDED_FORMAT_TESTS)
			
 
				 EXCLUDED_TEST_CASES = "*leap second*"
			
--- a/include/jvalidate/format.h
+++ b/include/jvalidate/format.h
@@ -166,8 +166,10 @@ inline bool is_uri_template_expression(std::u32string_view part) {
 
				   return true;
			
 
				 }
			
 
				 
			
 
				-template <typename CharT> inline bool is_uri_authority(std::basic_string_view<CharT> uri) {
			
 
				-  if (size_t pos = uri.find('@'); pos != uri.npos && pos < uri.find('/')) {
			
 
				+template <typename CharT> bool is_uri_authority(std::basic_string_view<CharT> uri) {
			
 
				+  // A URI Authority section MAY contain user info, which is every character up
			
 
				+  // to the first "@" character, as long as that character is not part of the path
			
 
				+  if (size_t pos = uri.find('@'); pos != uri.npos) {
			
 
				     for (size_t i = 0; i < pos; ++i) {
			
 
				       if (not is_pchar(uri, i, ":")) {
			
 
				         return false;
			
@@ -175,6 +177,9 @@ template <typename CharT> inline bool is_uri_authority(std::basic_string_view<Ch
 
				     }
			
 
				     uri.remove_prefix(pos + 1);
			
 
				   }
			
 
				+
			
 
				+  // A URI Authority HOST section
			
 
				+  // If the URI starts with '[', then it MUST BE an IPv6 or an "IPvFuture"
			
 
				   if (uri[0] == '[') {
			
 
				     size_t pos = uri.find(']');
			
 
				     auto ip = uri.substr(1, pos - 1);
			
@@ -183,12 +188,16 @@ template <typename CharT> inline bool is_uri_authority(std::basic_string_view<Ch
 
				       return false;
			
 
				     }
			
 
				   }
			
 
				+
			
 
				+  // A URI Authority PORT section. Technically allows any number of digits
			
 
				   if (size_t pos = uri.find(':'); pos != uri.npos) {
			
 
				     if (not std::ranges::all_of(uri.substr(pos + 1), [](auto c) { return std::isdigit(c); })) {
			
 
				       return false;
			
 
				     }
			
 
				     uri.remove_suffix(uri.size() - pos + 1);
			
 
				   }
			
 
				+
			
 
				+  // Normal URI Authority HOST section is either an IPv4 or a HOSTNAME
			
 
				   return ipv4(to_u8(uri)) || hostname(uri);
			
 
				 }
			
 
				 
			
@@ -351,16 +360,26 @@ inline bool duration(std::string_view dur) {
 
				     return text.find(type);
			
 
				   };
			
 
				 
			
 
				+  // All DURATION entities must start with the prefix 'P', and cannot be empty
			
 
				+  // past that point.
			
 
				   if (dur[0] != 'P' || dur.size() == 1) {
			
 
				     return false;
			
 
				   }
			
 
				   dur.remove_prefix(1);
			
 
				 
			
 
				+  // Special Case: a duration measured in weeks is incompatible with other
			
 
				+  // duration tokens.
			
 
				+  if (eat("W") != std::string::npos) {
			
 
				+    return dur.empty();
			
 
				+  }
			
 
				+
			
 
				+  // DURATION takes the following form; the 'T' separator is needed because
			
 
				+  // the same token 'M' is used for both Months and Minutes.
			
 
				+  // "P[#Y][#M][#D][T[#H][#M][#S]]".
			
 
				+  // At least one of the optional fields must be present.
			
 
				   if (dur[0] != 'T') {
			
 
				-    if (eat("W") != std::string::npos) {
			
 
				-      return dur.empty();
			
 
				-    }
			
 
				     std::string_view ymd{"YMD"};
			
 
				+    // Read YMD duration offsets in that order, allowing us to skip past them.
			
 
				     while (not ymd.empty() && not dur.empty()) {
			
 
				       if (size_t n = eat(ymd); n != std::string::npos) {
			
 
				         ymd.remove_prefix(n + 1);
			
@@ -373,12 +392,15 @@ inline bool duration(std::string_view dur) {
 
				     }
			
 
				   }
			
 
				 
			
 
				+  // If we have a 'T' prefix for Hour/Minute/Second offsets, we must have at
			
 
				+  // least one of them present.
			
 
				   if (dur[0] != 'T' || dur.size() == 1) {
			
 
				     return false;
			
 
				   }
			
 
				   dur.remove_prefix(1);
			
 
				 
			
 
				   std::string_view hms{"HMS"};
			
 
				+  // Read HMS duration offsets in that order, allowing us to skip past them.
			
 
				   while (not hms.empty() && not dur.empty()) {
			
 
				     if (size_t n = eat(hms); n != std::string::npos) {
			
 
				       hms.remove_prefix(n + 1);
			
@@ -389,10 +411,6 @@ inline bool duration(std::string_view dur) {
 
				   return dur.empty();
			
 
				 }
			
 
				 
			
 
				-template <typename CharT> bool is_invalid_host_char(CharT c) {
			
 
				-  return c != '-' && not(std::isalnum(c) || c > 0x7F);
			
 
				-}
			
 
				-
			
 
				 template <typename CharT>
			
 
				 bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
			
 
				   using delim = detail::char_delimiters<CharT>;
			
@@ -435,57 +453,79 @@ template <typename CharT> inline bool hostname(std::basic_string_view<CharT> nam
 
				     return false;
			
 
				   }
			
 
				 
			
 
				+  // In general, the maximum length of a hostname is 253 UTF-8 characters.
			
 
				   if (detail::to_u8(name).size() > (name.back() == '.' ? 254 : 253)) {
			
 
				     return false;
			
 
				   }
			
 
				 
			
 
				+  // Unfortunately, the ada-idna library does not validate things like
			
 
				+  // "is there a HEBREW character after the HEBREW COMMA".
			
 
				   if (not std::ranges::all_of(delim::special_cases,
			
 
				                               [name](auto & sc) { return sc.accepts(name); })) {
			
 
				     return false;
			
 
				   }
			
 
				 
			
 
				+  // We validate each sub-section of the hostname in parts, delimited by '.'
			
 
				   for (size_t n = name.find('.'); n != std::string::npos;
			
 
				        name.remove_prefix(n + 1), n = name.find('.')) {
			
 
				     if (not hostname_part(name.substr(0, n))) {
			
 
				       return false;
			
 
				     }
			
 
				   }
			
 
				+
			
 
				+  // name.empty() would be true only if the final character in the input name
			
 
				+  // was '.', this is the only empty hostname part that we allow. Otherwise, we
			
 
				+  // have a trailing hostname_part.
			
 
				   return name.empty() || hostname_part(name);
			
 
				 }
			
 
				 
			
 
				 inline bool ipv4(std::string_view ip) {
			
 
				   unsigned int ip0, ip1, ip2, ip3;
			
 
				   char eof;
			
 
				-  if (ip.find_first_not_of("0123456789.") != std::string::npos) {
			
 
				+  // IPv4 address MAY only contain DIGITS and '.'
			
 
				+  if (ip.find_first_not_of("0123456789.") != ip.npos) {
			
 
				     return false;
			
 
				   }
			
 
				+
			
 
				+  // Each OCTET of an IPv4 can only start with '0' if it is EXACTLY '0'
			
 
				   if (ip[0] == '0' && std::isdigit(ip[1])) {
			
 
				     return false;
			
 
				   }
			
 
				-  if (size_t n = ip.find(".0"); n != std::string::npos && std::isdigit(ip[n + 2])) {
			
 
				+  if (size_t n = ip.find(".0"); n != ip.npos && std::isdigit(ip[n + 2])) {
			
 
				     return false;
			
 
				   }
			
 
				+
			
 
				+  // sscanf returns the number of tokens parsed successfully.
			
 
				+  // Therefore, we can add a trailing character output to the format-string
			
 
				+  // and check that nothing was parsed into it (exactly 4 tokens matched).
			
 
				   if (sscanf(std::string(ip).c_str(), "%3u.%3u.%3u.%3u%c", &ip0, &ip1, &ip2, &ip3, &eof) != 4) {
			
 
				     return false;
			
 
				   }
			
 
				+  // Affirm that each OCTET fits in a single byte (0-255).
			
 
				   return ip0 <= 0xFF && ip1 <= 0xFF && ip2 <= 0xFF && ip3 <= 0xFF;
			
 
				 }
			
 
				 
			
 
				 inline bool ipv6(std::string_view ip) {
			
 
				   int expected_spans = 8;
			
 
				 
			
 
				+  // There is a special rule with IPv6 to allow an IPv4 address as a suffix
			
 
				   if (size_t n = ip.find('.'); n != std::string::npos) {
			
 
				     if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
			
 
				       return false;
			
 
				     }
			
 
				-    // This is a cheat to allow e.g. ::127.0.0.1 to validate
			
 
				+    // an ipv4 address holds 4 bytes and each ipv6 segment holds 2 bytes, so the
			
 
				+    // suffix replaces 2 segments. Its first octet is still counted as a span
			
 
				+    // below, so we expect 7. This even accepts "::127.0.0.1" as valid IPv6.
			
 
				     expected_spans = 7;
			
 
				     ip = ip.substr(0, n);
			
 
				   }
			
 
				 
			
 
				+  // IPv6 address MAY only contain HEXDIGITs and ':'
			
 
				   if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
			
 
				     return false;
			
 
				   }
			
 
				+  // IPv6 addresses can have a maximum of 39 characters (8 4-char HEXDIGIT
			
 
				+  // segments with 7 dividing ':'s).
			
 
				   if (ip.size() >= 40) {
			
 
				     return false;
			
 
				   }
			
@@ -497,25 +537,31 @@ inline bool ipv6(std::string_view ip) {
 
				     has_compressed = true;
			
 
				     ip.remove_prefix(2);
			
 
				   }
			
 
				-  while (!ip.empty()) {
			
 
				+
			
 
				+  while (!ip.empty() && ++groups) {
			
 
				     int data;
			
 
				     if (sscanf(ip.data(), "%4x", &data) != 1) {
			
 
				-      return false;
			
 
				+      return false; // Not a 4-byte HEXDIGIT
			
 
				     }
			
 
				-    if (size_t n = ip.find(':'); std::min(n, ip.size()) > 4) {
			
 
				-      return false;
			
 
				+
			
 
				+    if (size_t const n = ip.find(':'); std::min(n, ip.size()) > 4) {
			
 
				+      return false; // Segment too wide
			
 
				     } else if (n != std::string::npos) {
			
 
				       ip.remove_prefix(n + 1);
			
 
				     } else {
			
 
				-      ip = "";
			
 
				+      break; // End of String
			
 
				     }
			
 
				-    ++groups;
			
 
				-    if (ip[0] == ':') {
			
 
				-      if (std::exchange(has_compressed, true)) {
			
 
				-        return false;
			
 
				-      }
			
 
				-      ip.remove_prefix(1);
			
 
				+
			
 
				+    // We removed the regular ':', so this is a check for a compression mark
			
 
				+    if (ip[0] != ':') {
			
 
				+      continue;
			
 
				+    }
			
 
				+    if (std::exchange(has_compressed, true)) {
			
 
				+      // The above trick allows us to ensure that there is no more than one
			
 
				+      // set of "::" compression tokens in this IPv6 address.
			
 
				+      return false;
			
 
				     }
			
 
				+    ip.remove_prefix(1);
			
 
				   }
			
 
				 
			
 
				   return groups == expected_spans || (has_compressed && groups < expected_spans);
			
@@ -528,7 +574,7 @@ inline bool ipv6(std::string_view ip) {
 
				 // on each side, we ought to call it an email.
			
 
				 template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
			
 
				   using delim = detail::char_delimiters<CharT>;
			
 
				-  size_t n = em.find_last_of('@');
			
 
				+  size_t const n = em.find_last_of('@');
			
 
				   if (n == 0 || n >= em.size() - 1) {
			
 
				     return false;
			
 
				   }
			
@@ -542,6 +588,8 @@ template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
 
				     return false;
			
 
				   }
			
 
				 
			
 
				+  // The DOMAIN section of an email address MAY be either a HOSTNAME, or an
			
 
				+  // IP Address surrounded in brackets.
			
 
				   auto domain = em.substr(n + 1);
			
 
				   if (not(domain.starts_with('[') && domain.ends_with(']'))) {
			
 
				     return hostname(domain);
			
@@ -549,6 +597,8 @@ template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
 
				   domain.remove_prefix(1);
			
 
				   domain.remove_suffix(1);
			
 
				 
			
 
				+  // When the DOMAIN is an IPv6, it must start with the "IPv6:" tag required
			
 
				+  // for address literals (see RFC 5321, section 4.1.3).
			
 
				   if (auto ip = detail::to_u8(domain); ip.starts_with("IPv6:")) {
			
 
				     return ipv6(ip.substr(5));
			
 
				   } else {