Quellcode durchsuchen

chore: fix filtering of test cases

Sam Jaffe vor 7 Monaten
Ursprung
Commit
aa196e3a5d
2 geänderte Dateien mit 77 neuen und 26 gelöschten Zeilen
  1. 3 2
      Makefile
  2. 74 24
      include/jvalidate/format.h

+ 3 - 2
Makefile

@@ -26,8 +26,9 @@ TEST_OBJECTS := $(patsubst %.cxx, .build/%.o, $(TEST_SOURCES))
 TEST_BINARIES := .build/bin/selfvalidate .build/bin/annotation_test .build/bin/extension_test
 EXECUTE_TESTS := $(patsubst %, %.done, $(TEST_BINARIES))
 
-EXCLUDED_FORMAT_TESTS := color idn iri iru ip-address host-name
-EXCLUDED_FORMAT_TESTS := $(shell printf ":*optional_format_%s*" $(EXCLUDED_TESTS) | cut -c2-)
+EXCLUDED_FORMAT_TESTS := color ip_address host_name
+EXCLUDED_FORMAT_TESTS := $(shell printf ":*optional_format_%s*" $(EXCLUDED_FORMAT_TESTS) | cut -c2-)
+EXCLUDED_FORMAT_TESTS := Draft3/JsonSchema.TestSuite/optional_format_time:$(EXCLUDED_FORMAT_TESTS)
 EXCLUDED_TESTS := content ecmascript_regex zeroTerminatedFloats non_bmp_regex
 EXCLUDED_TESTS := $(shell printf ":*optional_%s" $(EXCLUDED_TESTS) | cut -c2-):$(EXCLUDED_FORMAT_TESTS)
 EXCLUDED_TEST_CASES = "*leap second*"

+ 74 - 24
include/jvalidate/format.h

@@ -166,8 +166,10 @@ inline bool is_uri_template_expression(std::u32string_view part) {
   return true;
 }
 
-template <typename CharT> inline bool is_uri_authority(std::basic_string_view<CharT> uri) {
-  if (size_t pos = uri.find('@'); pos != uri.npos && pos < uri.find('/')) {
+template <typename CharT> bool is_uri_authority(std::basic_string_view<CharT> uri) {
+  // A URI Authority section MAY contain user info, which is every character up
+  // to the first "@" character, as long as that character is not part of the path
+  if (size_t pos = uri.find('@'); pos != uri.npos) {
     for (size_t i = 0; i < pos; ++i) {
       if (not is_pchar(uri, i, ":")) {
         return false;
@@ -175,6 +177,9 @@ template <typename CharT> inline bool is_uri_authority(std::basic_string_view<Ch
     }
     uri.remove_prefix(pos + 1);
   }
+
+  // A URI Authority HOST section
+  // If the URI starts with '[', then it MUST BE an IPv6 or an "IPvFuture"
   if (uri[0] == '[') {
     size_t pos = uri.find(']');
     auto ip = uri.substr(1, pos - 1);
@@ -183,12 +188,16 @@ template <typename CharT> inline bool is_uri_authority(std::basic_string_view<Ch
       return false;
     }
   }
+
+  // A URI Authority PORT section. Technically allows any number of digits
   if (size_t pos = uri.find(':'); pos != uri.npos) {
     if (not std::ranges::all_of(uri.substr(pos + 1), [](auto c) { return std::isdigit(c); })) {
       return false;
     }
     uri.remove_suffix(uri.size() - pos + 1);
   }
+
+  // Normal URI Authority HOST section is either an IPv4 or a HOSTNAME
   return ipv4(to_u8(uri)) || hostname(uri);
 }
 
@@ -351,16 +360,26 @@ inline bool duration(std::string_view dur) {
     return text.find(type);
   };
 
+  // All DURATION entities must start with the prefix 'P', and cannot be empty
+  // past that point.
   if (dur[0] != 'P' || dur.size() == 1) {
     return false;
   }
   dur.remove_prefix(1);
 
+  // Special Case: a duration measured in weeks is incompatible with other
+  // duration tokens.
+  if (eat("W") != std::string::npos) {
+    return dur.empty();
+  }
+
+  // DURATION takes the following form, because we use the same token for both
+  // Months and Minutes.
+  // "P[#Y][#M][#D][T[#H][#M][#S]]".
+  // At least one of the optional fields must be present.
   if (dur[0] != 'T') {
-    if (eat("W") != std::string::npos) {
-      return dur.empty();
-    }
     std::string_view ymd{"YMD"};
+    // Read YMD duration offsets in that order, allowing us to skip past them.
     while (not ymd.empty() && not dur.empty()) {
       if (size_t n = eat(ymd); n != std::string::npos) {
         ymd.remove_prefix(n + 1);
@@ -373,12 +392,15 @@ inline bool duration(std::string_view dur) {
     }
   }
 
+  // If we have a 'T' prefix for Hour/Minute/Second offsets, we must have at
+  // least one of them present.
   if (dur[0] != 'T' || dur.size() == 1) {
     return false;
   }
   dur.remove_prefix(1);
 
   std::string_view hms{"HMS"};
+  // Read HMS duration offsets in that order, allowing us to skip past them.
   while (not hms.empty() && not dur.empty()) {
     if (size_t n = eat(hms); n != std::string::npos) {
       hms.remove_prefix(n + 1);
@@ -389,10 +411,6 @@ inline bool duration(std::string_view dur) {
   return dur.empty();
 }
 
-template <typename CharT> bool is_invalid_host_char(CharT c) {
-  return c != '-' && not(std::isalnum(c) || c > 0x7F);
-}
-
 template <typename CharT>
 bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
   using delim = detail::char_delimiters<CharT>;
@@ -435,57 +453,79 @@ template <typename CharT> inline bool hostname(std::basic_string_view<CharT> nam
     return false;
   }
 
+  // In general, the maximum length of a hostname is 253 UTF-8 characters.
   if (detail::to_u8(name).size() > (name.back() == '.' ? 254 : 253)) {
     return false;
   }
 
+  // Unfortunately, the ada-idna library does not validate things like
+  // "is there a HEBREW character after the HEBREW COMMA".
   if (not std::ranges::all_of(delim::special_cases,
                               [name](auto & sc) { return sc.accepts(name); })) {
     return false;
   }
 
+  // We validate each sub-section of the hostname in parts, delimited by '.'
   for (size_t n = name.find('.'); n != std::string::npos;
        name.remove_prefix(n + 1), n = name.find('.')) {
     if (not hostname_part(name.substr(0, n))) {
       return false;
     }
   }
+
+  // name.empty() would be true only if the final character in the input name
+  // was '.', this is the only empty hostname part that we allow. Otherwise, we
+  // have a trailing hostname_part.
   return name.empty() || hostname_part(name);
 }
 
 inline bool ipv4(std::string_view ip) {
   unsigned int ip0, ip1, ip2, ip3;
   char eof;
-  if (ip.find_first_not_of("0123456789.") != std::string::npos) {
+  // IPv4 address MAY only contain DIGITS and '.'
+  if (ip.find_first_not_of("0123456789.") != ip.npos) {
     return false;
   }
+
+  // Each OCTET of an IPv4 can only start with '0' if it is EXACTLY '0'
   if (ip[0] == '0' && std::isdigit(ip[1])) {
     return false;
   }
-  if (size_t n = ip.find(".0"); n != std::string::npos && std::isdigit(ip[n + 2])) {
+  if (size_t n = ip.find(".0"); n != ip.npos && std::isdigit(ip[n + 2])) {
     return false;
   }
+
+  // sscanf returns the number of tokens parsed successfully.
+  // Therefore, we can add a trailing character output to the format-string
+  // and check that we failed to parse any token into the eof-character token.
   if (sscanf(std::string(ip).c_str(), "%3u.%3u.%3u.%3u%c", &ip0, &ip1, &ip2, &ip3, &eof) != 4) {
     return false;
   }
+  // Affirm that each OCTET fits in a single byte (i.e. is in the range 0-255).
   return ip0 <= 0xFF && ip1 <= 0xFF && ip2 <= 0xFF && ip3 <= 0xFF;
 }
 
 inline bool ipv6(std::string_view ip) {
   int expected_spans = 8;
 
+  // There is a special rule with IPv6 to allow an IPv4 address as a suffix
   if (size_t n = ip.find('.'); n != std::string::npos) {
     if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
       return false;
     }
-    // This is a cheat to allow e.g. ::127.0.0.1 to validate
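+    // An IPv4 suffix carries 4 bytes of information, i.e. it stands in for two
+    // 2-byte IPv6 segments, leaving 6 regular segments. Because we cut the
+    // string at the first '.', the leading IPv4 octet is still left behind and
+    // gets counted as one more span - so we expect 7 spans in total. This can
+    // even accept IPv6 things like "::127.0.0.1" as valid IPv6.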
     expected_spans = 7;
     ip = ip.substr(0, n);
   }
 
+  // IPv6 address MAY only contain HEXDIGITs and ':'
   if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
     return false;
   }
+  // IPv6 addresses can have a maximum of 39 characters (8 4-char HEXDIGIT
+  // segments with 7 dividing ':'s).
   if (ip.size() >= 40) {
     return false;
   }
@@ -497,25 +537,31 @@ inline bool ipv6(std::string_view ip) {
     has_compressed = true;
     ip.remove_prefix(2);
   }
-  while (!ip.empty()) {
+
+  while (!ip.empty() && ++groups) {
     int data;
     if (sscanf(ip.data(), "%4x", &data) != 1) {
-      return false;
+      return false; // Segment does not start with HEXDIGITs (up to 4 are read)
     }
-    if (size_t n = ip.find(':'); std::min(n, ip.size()) > 4) {
-      return false;
+
+    if (size_t const n = ip.find(':'); std::min(n, ip.size()) > 4) {
+      return false; // Segment too wide
     } else if (n != std::string::npos) {
       ip.remove_prefix(n + 1);
     } else {
-      ip = "";
+      break; // End of String
     }
-    ++groups;
-    if (ip[0] == ':') {
-      if (std::exchange(has_compressed, true)) {
-        return false;
-      }
-      ip.remove_prefix(1);
+
+    // We removed the regular ':', so this is a check for a compression mark
+    if (ip[0] != ':') {
+      continue;
+    }
+    if (std::exchange(has_compressed, true)) {
+      // The above trick allows us to ensure that there is no more than one
+      // set of "::" compression tokens in this IPv6 address.
+      return false;
     }
+    ip.remove_prefix(1);
   }
 
   return groups == expected_spans || (has_compressed && groups < expected_spans);
@@ -528,7 +574,7 @@ inline bool ipv6(std::string_view ip) {
 // on each side, we ought to call it an email.
 template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
   using delim = detail::char_delimiters<CharT>;
-  size_t n = em.find_last_of('@');
+  size_t const n = em.find_last_of('@');
   if (n == 0 || n >= em.size() - 1) {
     return false;
   }
@@ -542,6 +588,8 @@ template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
     return false;
   }
 
+  // The DOMAIN section of an email address MAY be either a HOSTNAME, or an
+  // IP Address surrounded in brackets.
   auto domain = em.substr(n + 1);
   if (not(domain.starts_with('[') && domain.ends_with(']'))) {
     return hostname(domain);
@@ -549,6 +597,8 @@ template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
   domain.remove_prefix(1);
   domain.remove_suffix(1);
 
+  // When the DOMAIN is an IPv6, it must start with "IPv6:" for some
+  // weird compatibility reason.
   if (auto ip = detail::to_u8(domain); ip.starts_with("IPv6:")) {
     return ipv6(ip.substr(5));
   } else {