|
|
@@ -1,16 +1,72 @@
|
|
|
#pragma once
|
|
|
+#include <cstdio>
|
|
|
+#include <functional>
|
|
|
+#include <jvalidate/_macro.h>
|
|
|
|
|
|
#include <cctype>
|
|
|
+#include <chrono>
|
|
|
#include <cstddef>
|
|
|
+#include <cstring>
|
|
|
#include <ctime>
|
|
|
-#include <iostream>
|
|
|
#include <string>
|
|
|
+#include <string_view>
|
|
|
+#include <system_error>
|
|
|
#include <unordered_map>
|
|
|
+#include <unordered_set>
|
|
|
#include <utility>
|
|
|
|
|
|
+#if JVALIDATE_HAS_IDNA
|
|
|
+#include <ada/idna/to_unicode.h>
|
|
|
+#include <ada/idna/validity.h>
|
|
|
+#endif
|
|
|
+
|
|
|
+#include <jvalidate/detail/expect.h>
|
|
|
+#include <jvalidate/detail/idna_special_cases.h>
|
|
|
+#include <jvalidate/detail/pointer.h>
|
|
|
+#include <jvalidate/detail/relative_pointer.h>
|
|
|
+#include <jvalidate/detail/string.h>
|
|
|
+#include <jvalidate/enum.h>
|
|
|
#include <jvalidate/forward.h>
|
|
|
|
|
|
+#define CONSTRUCTS(TYPE) format::ctor_as_valid<detail::TYPE>
|
|
|
+
|
|
|
+#define UTF32(FN) JVALIDATE_IIF(JVALIDATE_HAS_IDNA, format::utf32<format::FN<char32_t>>, nullptr)
|
|
|
+
|
|
|
+namespace jvalidate::format {
|
|
|
+bool date(std::string_view dt);
|
|
|
+bool time(std::string_view dt);
|
|
|
+bool date_time(std::string_view dt);
|
|
|
+bool duration(std::string_view dur);
|
|
|
+
|
|
|
+template <typename CharT = char> bool uri(std::basic_string_view<CharT> uri);
|
|
|
+template <typename CharT = char> bool uri_reference(std::basic_string_view<CharT> uri);
|
|
|
+bool uri_template(std::u32string_view uri);
|
|
|
+bool uuid(std::string_view id);
|
|
|
+template <typename CharT = char> bool hostname(std::basic_string_view<CharT> name);
|
|
|
+
|
|
|
+bool ipv4(std::string_view ip);
|
|
|
+bool ipv6(std::string_view ip);
|
|
|
+
|
|
|
+template <typename CharT = char> bool email(std::basic_string_view<CharT> em);
|
|
|
+}
|
|
|
+
|
|
|
namespace jvalidate::format::detail {
|
|
|
+inline bool is_dec(std::string_view s, size_t min = 0, size_t max = std::string_view::npos) {
|
|
|
+ constexpr char const * g_dec_digits = "0123456789";
|
|
|
+ return s.find_first_not_of(g_dec_digits) == std::string::npos && s.size() >= min &&
|
|
|
+ s.size() <= max;
|
|
|
+}
|
|
|
+
|
|
|
+inline bool is_hex(std::string_view s) {
|
|
|
+ constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
|
|
|
+ return s.find_first_not_of(g_hex_digits) == std::string::npos;
|
|
|
+}
|
|
|
+
|
|
|
+struct result {
|
|
|
+ ptrdiff_t consumed;
|
|
|
+ bool valid;
|
|
|
+};
|
|
|
+
|
|
|
inline bool is_leapyear(int y) { return (y % 400) == 0 || ((y % 4) == 0 && (y % 100) != 0); }
|
|
|
|
|
|
inline bool illegal_date(int y, int m, int d) {
|
|
|
@@ -21,26 +77,228 @@ inline bool illegal_date(int y, int m, int d) {
|
|
|
return d > days[m];
|
|
|
}
|
|
|
|
|
|
-inline auto date(std::string_view dt) {
|
|
|
+inline result date(std::string_view dt) {
|
|
|
struct tm tm;
|
|
|
if (auto end = strptime(dt.data(), "%Y-%m-%d", &tm); end) {
|
|
|
if ((end - dt.data()) != 10 || illegal_date(tm.tm_year + 1900, tm.tm_mon, tm.tm_mday)) {
|
|
|
- return std::make_pair(0L, false);
|
|
|
+ return {.consumed = 0, .valid = false};
|
|
|
+ }
|
|
|
+ return {.consumed = end - dt.data(), .valid = true};
|
|
|
+ }
|
|
|
+ return {.consumed = 0L, .valid = false};
|
|
|
+}
|
|
|
+
|
|
|
+inline bool is_leapsecond(std::tm tm) {
|
|
|
+ if (tm.tm_sec != 60) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
+#if __cpp_lib_chrono >= 201907L
|
|
|
+ tm.tm_isdst = -1;
|
|
|
+ std::chrono::seconds time(std::mktime(&tm));
|
|
|
+ auto const & leap_seconds = std::chrono::get_tzdb().leap_seconds;
|
|
|
+ return std::ranges::find(leap_seconds, time) != leap_seconds.end();
|
|
|
+#else
|
|
|
+ return false;
|
|
|
+#endif
|
|
|
+}
|
|
|
+
|
|
|
+// https://www.rfc-editor.org/rfc/rfc6570.html#section-1.5
|
|
|
+inline bool is_uschar(int c) {
|
|
|
+ using P = std::pair<int, int>;
|
|
|
+ constexpr std::array data{
|
|
|
+ P{0xA0, 0xD7FF}, P{0xF900, 0xFDCF}, P{0xFDF0, 0xFFEF}, P{0x10000, 0x1FFFD},
|
|
|
+ P{0x20000, 0x2FFFD}, P{0x30000, 0x3FFFD}, P{0x40000, 0x4FFFD}, P{0x50000, 0x5FFFD},
|
|
|
+ P{0x60000, 0x6FFFD}, P{0x70000, 0x7FFFD}, P{0x80000, 0x8FFFD}, P{0x90000, 0x9FFFD},
|
|
|
+ P{0xA0000, 0xAFFFD}, P{0xB0000, 0xBFFFD}, P{0xC0000, 0xCFFFD}, P{0xD0000, 0xDFFFD},
|
|
|
+ P{0xE0000, 0xEFFFD},
|
|
|
+ };
|
|
|
+ return std::ranges::any_of(data,
|
|
|
+ [c](auto & pair) { return c >= pair.first && c <= pair.second; });
|
|
|
+}
|
|
|
+
|
|
|
+template <typename CharT>
|
|
|
+inline bool is_pchar(std::basic_string_view<CharT> part, size_t & pos,
|
|
|
+ std::string_view extra_valid_chars = ":@") {
|
|
|
+ constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
|
|
|
+ if (std::isalnum(part[pos]) || is_uschar(part[pos]) ||
|
|
|
+ std::strchr("-._~!$&'()*+,;=", part[pos])) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ if (part[pos] == '%') {
|
|
|
+ return pos + 2 < part.size() && std::strchr(g_hex_digits, part[++pos]) &&
|
|
|
+ std::strchr(g_hex_digits, part[++pos]);
|
|
|
+ }
|
|
|
+ return extra_valid_chars.find(part[pos]) != part.npos;
|
|
|
+};
|
|
|
+
|
|
|
+inline bool is_uri_template_literal(std::u32string_view part, size_t & pos) {
|
|
|
+ constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
|
|
|
+ if (part[pos] == '%') {
|
|
|
+ return pos + 2 < part.size() && std::strchr(g_hex_digits, part[++pos]) &&
|
|
|
+ std::strchr(g_hex_digits, part[++pos]);
|
|
|
+ }
|
|
|
+ return !std::strchr(R"( "'%<>\^`{|}`)", part[pos]) && part[pos] > 0x1F && part[pos] != 0x7F;
|
|
|
+}
|
|
|
+
|
|
|
+inline bool is_uri_template_varchar(std::u32string_view part, size_t & pos) {
|
|
|
+ constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
|
|
|
+ if (part[pos] == '%') {
|
|
|
+ return pos + 2 < part.size() && std::strchr(g_hex_digits, part[++pos]) &&
|
|
|
+ std::strchr(g_hex_digits, part[++pos]);
|
|
|
+ }
|
|
|
+ return std::isalnum(part[pos]) || part[pos] == '_';
|
|
|
+}
|
|
|
+
|
|
|
+inline bool is_uri_template_expression(std::u32string_view part) {
|
|
|
+ if (part.empty()) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (std::strchr("+#./;?&=,!@|", part[0])) {
|
|
|
+ part.remove_prefix(1);
|
|
|
+ }
|
|
|
+
|
|
|
+ for (size_t pos = part.find(','); !part.empty();
|
|
|
+ part.remove_prefix(std::min(part.size(), pos)), pos = part.find(',')) {
|
|
|
+ std::u32string_view varspec = part.substr(0, pos);
|
|
|
+ std::u32string_view expand;
|
|
|
+ if (size_t const mod = varspec.find_first_of(U":*"); mod != varspec.npos) {
|
|
|
+ expand = varspec.substr(mod + 1);
|
|
|
+ varspec.remove_suffix(expand.size() + 1);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (expand.empty() || expand == U"*") {
|
|
|
+ // No Modifier, or Explode
|
|
|
+ } else if (expand.size() > 4 || expand[0] == '0' ||
|
|
|
+ not std::ranges::all_of(expand, [](char c) { return std::isdigit(c); })) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ for (size_t i = 0; i < varspec.size(); ++i) {
|
|
|
+ RETURN_UNLESS(is_uri_template_varchar(varspec, i) || (i > 0 && varspec[i] == '.'), false);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
+template <typename CharT> bool is_uri_authority(std::basic_string_view<CharT> uri) {
|
|
|
+ // A URI Authority section MAY contain user info, which is every character up
|
|
|
+ // to the first "@" character, as long as that character is not part of the path
|
|
|
+ if (size_t pos = uri.find('@'); pos != uri.npos) {
|
|
|
+ for (size_t i = 0; i < pos; ++i) {
|
|
|
+ if (not is_pchar(uri, i, ":")) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ uri.remove_prefix(pos + 1);
|
|
|
+ }
|
|
|
+
|
|
|
+ // A URI Authority HOST section
|
|
|
+ // If the URI starts with '[', then it MUST BE an IPv6 or an "IPvFuture"
|
|
|
+ bool const has_ipv6 = (uri[0] == '[');
|
|
|
+ if (has_ipv6) {
|
|
|
+ size_t pos = uri.find(']');
|
|
|
+ auto ip = uri.substr(1, pos - 1);
|
|
|
+ uri.remove_prefix(pos + 1);
|
|
|
+ if (not ipv6(to_u8(ip))) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // A URI Authority PORT section. Technically allows any number of digits
|
|
|
+ if (size_t pos = uri.find(':'); pos != uri.npos) {
|
|
|
+ if (not std::ranges::all_of(uri.substr(pos + 1), [](auto c) { return std::isdigit(c); })) {
|
|
|
+ return false;
|
|
|
}
|
|
|
- return std::make_pair(end - dt.data(), true);
|
|
|
+ uri.remove_suffix(uri.size() - pos + 1);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Normal URI Authority HOST section is either an IPv4 or a HOSTNAME
|
|
|
+ // if we had an ipv6 part, we can permit an empty string (since hostname
|
|
|
+ // no longer permits them).
|
|
|
+ return (has_ipv6 && uri.empty()) || ipv4(to_u8(uri)) || hostname(uri);
|
|
|
+}
|
|
|
+
|
|
|
+// Tests if a URI "Query Part" or "Fragment Part" is valid and removes that part from uri
|
|
|
+template <typename CharT> bool test_uri_part(std::basic_string_view<CharT> & uri, char delim) {
|
|
|
+ size_t const pos = uri.find(delim);
|
|
|
+ if (pos == uri.npos) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ auto part = uri.substr(pos + 1);
|
|
|
+ uri = uri.substr(0, pos);
|
|
|
+ for (size_t pos = 0; pos < part.size(); ++pos) {
|
|
|
+ RETURN_UNLESS(detail::is_pchar(part, pos, ":@/?"), false);
|
|
|
}
|
|
|
- return std::make_pair(0L, false);
|
|
|
+ return true;
|
|
|
+};
|
|
|
+}
|
|
|
+
|
|
|
+namespace jvalidate::format::draft03 {
|
|
|
+namespace detail = jvalidate::format::detail;
|
|
|
+
|
|
|
+inline bool time(std::string_view dt) {
|
|
|
+ std::tm tm;
|
|
|
+ char const * end = strptime(dt.data(), "%T", &tm);
|
|
|
+ if (end == nullptr || (end - dt.data()) < 8) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ return end == dt.end();
|
|
|
+}
|
|
|
+
|
|
|
+inline bool utc_millisec(std::string_view utc) {
|
|
|
+ int64_t itime;
|
|
|
+ if (auto [end, ec] = std::from_chars(utc.begin(), utc.end(), itime);
|
|
|
+ ec == std::errc{} && end == utc.end()) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ double dtime;
|
|
|
+ auto [end, ec] = std::from_chars(utc.begin(), utc.end(), dtime);
|
|
|
+ return ec == std::errc{} && end == utc.end();
|
|
|
+}
|
|
|
+
|
|
|
+inline bool css_2_1_color(std::string_view color) {
|
|
|
+ if (color.empty()) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
|
|
|
+ if (color[0] == '#') {
|
|
|
+ return color.size() <= 7 && detail::is_hex(color.substr(1));
|
|
|
+ }
|
|
|
+ static std::unordered_set<std::string_view> g_color_codes{
|
|
|
+ "maroon", "red", "orange", "yellow", "olive", "purple", "fuchsia", "white", "lime",
|
|
|
+ "green", "navy", "blue", "aqua", "teal", "black", "silver", "gray"};
|
|
|
+ return g_color_codes.contains(color);
|
|
|
+}
|
|
|
+
|
|
|
+inline bool e_123_phone(std::string_view phone) {
|
|
|
+ // https://support.secureauth.com/hc/en-us/articles/360036402211-Regular-Expressions-for-ITU-E-123-and-E-164-phone-number-formats
|
|
|
+ if (phone.empty()) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ if (phone[0] != '+') {
|
|
|
+ constexpr size_t g_usa_phone_tokens = 3;
|
|
|
+ char area[4], head[4], tail[5];
|
|
|
+ return sscanf(phone.data(), "(%3s) %3s %4s", area, head, tail) == g_usa_phone_tokens &&
|
|
|
+ detail::is_dec(area, 3) && detail::is_dec(head, 3) && detail::is_dec(tail, 4);
|
|
|
+ }
|
|
|
+ char tok0[4], tok1[4], tok2[4], tok3[5];
|
|
|
+ constexpr size_t g_i18n_phone_tokens = 4;
|
|
|
+ return sscanf(phone.data(), "+%3s %3s %3s %4s", tok0, tok1, tok2, tok3) == g_i18n_phone_tokens &&
|
|
|
+ detail::is_dec(tok0, 1, 3) && detail::is_dec(tok1, 2, 3) && detail::is_dec(tok2, 2, 3) &&
|
|
|
+ detail::is_dec(tok3, 4);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
namespace jvalidate::format {
|
|
|
inline bool date(std::string_view dt) {
|
|
|
- auto [size, good] = detail::date(dt);
|
|
|
- return good && size == dt.size();
|
|
|
+ auto [consumed, valid] = detail::date(dt);
|
|
|
+ return valid && consumed == dt.size();
|
|
|
}
|
|
|
|
|
|
inline bool time(std::string_view dt) {
|
|
|
- struct tm tm;
|
|
|
+ std::tm tm;
|
|
|
char const * end = strptime(dt.data(), "%T", &tm);
|
|
|
if (end == nullptr || end == dt.end() || (end - dt.data()) < 8) {
|
|
|
return false;
|
|
|
@@ -57,10 +315,10 @@ inline bool time(std::string_view dt) {
|
|
|
}
|
|
|
|
|
|
if (dt[0] == 'Z' || dt[0] == 'z') {
|
|
|
- return dt.size() == 1;
|
|
|
+ return dt.size() == 1 && detail::is_leapsecond(tm);
|
|
|
}
|
|
|
if (std::strchr("+-", dt[0])) {
|
|
|
- return strptime(dt.data() + 1, "%R", &tm) == dt.end();
|
|
|
+ return strptime(dt.data() + 1, "%R", &tm) == dt.end() && detail::is_leapsecond(tm);
|
|
|
}
|
|
|
return false;
|
|
|
}
|
|
|
@@ -74,18 +332,94 @@ inline bool date_time(std::string_view dt) {
|
|
|
return time(dt);
|
|
|
}
|
|
|
|
|
|
+template <typename CharT> inline bool uri(std::basic_string_view<CharT> uri) {
|
|
|
+ using delim = detail::char_delimiters<CharT>;
|
|
|
+
|
|
|
+ // https://www.rfc-editor.org/rfc/rfc3986.html#appendix-A
|
|
|
+ if (size_t const pos = uri.find(':'); pos != uri.npos) {
|
|
|
+ RETURN_UNLESS(std::isalpha(uri[0]), false);
|
|
|
+ for (size_t i = 1; i < pos; ++i) {
|
|
|
+ RETURN_UNLESS(std::isalnum(uri[i]) || std::strchr("+-.", uri[i]), false);
|
|
|
+ }
|
|
|
+ uri.remove_prefix(pos + 1);
|
|
|
+ } else {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ RETURN_UNLESS(detail::test_uri_part(uri, '#'), false);
|
|
|
+ RETURN_UNLESS(detail::test_uri_part(uri, '?'), false);
|
|
|
+
|
|
|
+ auto path = uri;
|
|
|
+ if (uri.starts_with(delim::double_slash)) {
|
|
|
+ uri.remove_prefix(2);
|
|
|
+ path = uri.substr(std::min(uri.size(), uri.find('/')));
|
|
|
+ uri.remove_suffix(path.size());
|
|
|
+ RETURN_UNLESS(detail::is_uri_authority(uri), false);
|
|
|
+ }
|
|
|
+
|
|
|
+ for (size_t i = 0; i < path.size(); ++i) {
|
|
|
+ RETURN_UNLESS(detail::is_pchar(path, i, "/:@"), false);
|
|
|
+ }
|
|
|
+
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
+template <typename CharT> inline bool uri_reference(std::basic_string_view<CharT> uri) {
|
|
|
+ using delim = detail::char_delimiters<CharT>;
|
|
|
+ if (jvalidate::format::uri(uri)) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
+ RETURN_UNLESS(detail::test_uri_part(uri, '#'), false);
|
|
|
+ RETURN_UNLESS(detail::test_uri_part(uri, '?'), false);
|
|
|
+
|
|
|
+ auto path = uri;
|
|
|
+ if (uri.starts_with(delim::double_slash)) {
|
|
|
+ uri.remove_prefix(2);
|
|
|
+ path = uri.substr(std::min(uri.size(), uri.find('/')));
|
|
|
+ uri.remove_suffix(path.size());
|
|
|
+ RETURN_UNLESS(detail::is_uri_authority(uri), false);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (size_t const pos = path.find('/'); pos != path.npos) {
|
|
|
+ for (size_t i = 0; i < pos; ++i) {
|
|
|
+ RETURN_UNLESS(detail::is_pchar(path, i, "@"), false);
|
|
|
+ }
|
|
|
+ path.remove_prefix(pos);
|
|
|
+ }
|
|
|
+
|
|
|
+ for (size_t i = 0; i < path.size(); ++i) {
|
|
|
+ RETURN_UNLESS(detail::is_pchar(path, i, "/:@"), false);
|
|
|
+ }
|
|
|
+
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
+inline bool uri_template(std::u32string_view uri) {
|
|
|
+ for (size_t i = 0; i < uri.size(); ++i) {
|
|
|
+ if (uri[i] != '{') {
|
|
|
+ RETURN_UNLESS(detail::is_uri_template_literal(uri, i), false);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ std::u32string_view expr = uri.substr(i + 1);
|
|
|
+ size_t const pos = expr.find('}');
|
|
|
+ RETURN_UNLESS(pos != uri.npos, false);
|
|
|
+ RETURN_UNLESS(detail::is_uri_template_expression(expr.substr(0, pos)), false);
|
|
|
+ i += pos + 1;
|
|
|
+ }
|
|
|
+ return true;
|
|
|
+}
|
|
|
+
|
|
|
inline bool uuid(std::string_view id) {
|
|
|
- constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
|
|
|
constexpr size_t g_uuid_len = 36;
|
|
|
constexpr size_t g_uuid_tokens = 5;
|
|
|
char tok0[9], tok1[5], tok2[5], tok3[5], tok4[13];
|
|
|
|
|
|
- auto is_hex = [](std::string_view s) {
|
|
|
- return s.find_first_not_of(g_hex_digits) == std::string::npos;
|
|
|
- };
|
|
|
return id.size() == g_uuid_len &&
|
|
|
sscanf(id.data(), "%8s-%4s-%4s-%4s-%12s", tok0, tok1, tok2, tok3, tok4) == g_uuid_tokens &&
|
|
|
- is_hex(tok0) && is_hex(tok1) && is_hex(tok2) && is_hex(tok3) && is_hex(tok4);
|
|
|
+ detail::is_hex(tok0) && detail::is_hex(tok1) && detail::is_hex(tok2) &&
|
|
|
+ detail::is_hex(tok3) && detail::is_hex(tok4);
|
|
|
}
|
|
|
|
|
|
inline bool duration(std::string_view dur) {
|
|
|
@@ -99,16 +433,26 @@ inline bool duration(std::string_view dur) {
|
|
|
return text.find(type);
|
|
|
};
|
|
|
|
|
|
+ // All DURATION entities must start with the prefix 'P', and cannot be empty
|
|
|
+ // past that point.
|
|
|
if (dur[0] != 'P' || dur.size() == 1) {
|
|
|
return false;
|
|
|
}
|
|
|
dur.remove_prefix(1);
|
|
|
|
|
|
+ // Special Case: a duration measured in weeks is incompatible with other
|
|
|
+ // duration tokens.
|
|
|
+ if (eat("W") != std::string::npos) {
|
|
|
+ return dur.empty();
|
|
|
+ }
|
|
|
+
|
|
|
+ // DURATION takes the following form, because we use the same token for both
|
|
|
+ // Months and Minutes.
|
|
|
+ // "P[#Y][#M][#D][T[#H][#M][#S]]".
|
|
|
+ // At least one of the optional fields must be present.
|
|
|
if (dur[0] != 'T') {
|
|
|
- if (eat("W") != std::string::npos) {
|
|
|
- return dur.empty();
|
|
|
- }
|
|
|
std::string_view ymd{"YMD"};
|
|
|
+ // Read YMD duration offsets in that order, allowing us to skip past them.
|
|
|
while (not ymd.empty() && not dur.empty()) {
|
|
|
if (size_t n = eat(ymd); n != std::string::npos) {
|
|
|
ymd.remove_prefix(n + 1);
|
|
|
@@ -121,12 +465,15 @@ inline bool duration(std::string_view dur) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ // If we have a 'T' prefix for Hour/Minute/Second offsets, we must have at
|
|
|
+ // least one of them present.
|
|
|
if (dur[0] != 'T' || dur.size() == 1) {
|
|
|
return false;
|
|
|
}
|
|
|
dur.remove_prefix(1);
|
|
|
|
|
|
std::string_view hms{"HMS"};
|
|
|
+ // Read HMS duration offsets in that order, allowing us to skip past them.
|
|
|
while (not hms.empty() && not dur.empty()) {
|
|
|
if (size_t n = eat(hms); n != std::string::npos) {
|
|
|
hms.remove_prefix(n + 1);
|
|
|
@@ -137,66 +484,130 @@ inline bool duration(std::string_view dur) {
|
|
|
return dur.empty();
|
|
|
}
|
|
|
|
|
|
-// Limitation - does not inspect graphemes, so it cannot check idn-hostname
|
|
|
-// to fix this - we'd need to
|
|
|
-inline bool hostname(std::string_view name) {
|
|
|
- auto hostname_part = [&name](size_t end) {
|
|
|
- if (end == 0 || end >= 64 || name[0] == '-' || name[end - 1] == '-') {
|
|
|
- return false;
|
|
|
- }
|
|
|
- for (size_t i = 0; i < end; ++i) {
|
|
|
- if (name[i] != '-' && not std::isalnum(name[i])) {
|
|
|
- return false;
|
|
|
- }
|
|
|
- }
|
|
|
- return true;
|
|
|
- };
|
|
|
+template <typename CharT>
|
|
|
+bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
|
|
|
+ using delim = detail::char_delimiters<CharT>;
|
|
|
+ return (name.empty() || name.length() >= 64 ||
|
|
|
+ (name.size() >= 4 && name.substr(2).starts_with(delim::illegal_dashes_ulabel)) ||
|
|
|
+ name[0] == '-' || name.back() == '-');
|
|
|
+}
|
|
|
+
|
|
|
+#if !JVALIDATE_HAS_IDNA
|
|
|
+inline bool hostname_part(std::string_view name) {
|
|
|
+ using delim = detail::char_delimiters<char>;
|
|
|
+
|
|
|
+ if (is_invalid_size_or_boundary_hostname(name)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ return std::ranges::none_of(name, [](char c) { return c != '-' && not std::isalnum(c); });
|
|
|
+}
|
|
|
+#else
|
|
|
+template <typename CharT> inline bool hostname_part(std::basic_string_view<CharT> name) {
|
|
|
+ using delim = detail::char_delimiters<CharT>;
|
|
|
+ // Punycode is a way to restructure UTF-8 strings to be ASCII-compatible.
|
|
|
+ // All Punycode strings start with "xn--" (and would therefore fail below).
|
|
|
+ if (name.starts_with(delim::punycode_prefix)) {
|
|
|
+ std::u32string decoded = detail::to_u32(ada::idna::to_unicode(detail::to_u8(name)));
|
|
|
+ return (decoded != detail::to_u32(name)) && hostname_part<char32_t>(decoded);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Unfortunately, the ada-idna library does not validate things like
|
|
|
+ // "is there a HEBREW character after the HEBREW COMMA".
|
|
|
+ if (not detail::is_special_case_ok(name)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (name.find_first_of(delim::illegal_hostname_chars) != name.npos) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ // An INVALID hostname part is one of the following:
|
|
|
+ // - empty
|
|
|
+ // - more than 63 characters long
|
|
|
+ // - starts or ends with a '-'
|
|
|
+ // - matches the regular expression /^..--.*$/
|
|
|
+ if (is_invalid_size_or_boundary_hostname(name)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ // This is a much easier check in hostname than idn-hostname, since we can
|
|
|
+ // just check for alphanumeric and '-'.
|
|
|
+ if constexpr (std::is_same_v<char, CharT>) {
|
|
|
+ return std::ranges::none_of(name, [](char c) { return c != '-' && not std::isalnum(c); });
|
|
|
+ } else {
|
|
|
+ return ada::idna::is_label_valid(name);
|
|
|
+ }
|
|
|
+}
|
|
|
+#endif
|
|
|
|
|
|
- if (name.size() > (name.back() == '.' ? 254 : 253)) {
|
|
|
+template <typename CharT> inline bool hostname(std::basic_string_view<CharT> name) {
|
|
|
+ using delim = detail::char_delimiters<CharT>;
|
|
|
+ // In general, the maximum length of a hostname is 253 characters.
|
|
|
+ if (name.empty() || name.length() > 253) {
|
|
|
return false;
|
|
|
}
|
|
|
- for (size_t n = name.find('.'); n != std::string::npos;
|
|
|
- name.remove_prefix(n + 1), n = name.find('.')) {
|
|
|
- if (not hostname_part(n)) {
|
|
|
+
|
|
|
+ // We validate each sub-section of the hostname in parts, delimited by '.'
|
|
|
+ for (size_t n = name.find_first_of(delim::hostname_part_delims); n != std::string::npos;
|
|
|
+ name.remove_prefix(n + 1), n = name.find_first_of(delim::hostname_part_delims)) {
|
|
|
+ if (not hostname_part(name.substr(0, n))) {
|
|
|
return false;
|
|
|
}
|
|
|
}
|
|
|
- return name.empty() || hostname_part(name.size());
|
|
|
+
|
|
|
+ // Previous test versions allowed for a hostname to end with '.', but this is
|
|
|
+ // not permitted in the latest test specification.
|
|
|
+ return hostname_part(name);
|
|
|
}
|
|
|
|
|
|
inline bool ipv4(std::string_view ip) {
|
|
|
unsigned int ip0, ip1, ip2, ip3;
|
|
|
char eof;
|
|
|
- if (ip.find_first_not_of("0123456789.") != std::string::npos) {
|
|
|
+ // IPv4 address MAY only contain DIGITS and '.'
|
|
|
+ if (ip.find_first_not_of("0123456789.") != ip.npos) {
|
|
|
return false;
|
|
|
}
|
|
|
+
|
|
|
+ // Each OCTET of an IPv4 can only start with '0' if it is EXACTLY '0'
|
|
|
if (ip[0] == '0' && std::isdigit(ip[1])) {
|
|
|
return false;
|
|
|
}
|
|
|
- if (size_t n = ip.find(".0"); n != std::string::npos && std::isdigit(ip[n + 2])) {
|
|
|
+ if (size_t n = ip.find(".0"); n != ip.npos && std::isdigit(ip[n + 2])) {
|
|
|
return false;
|
|
|
}
|
|
|
- if (sscanf(ip.data(), "%3u.%3u.%3u.%3u%c", &ip0, &ip1, &ip2, &ip3, &eof) != 4) {
|
|
|
+
|
|
|
+ // sscanf returns the number of tokens parsed successfully.
|
|
|
+ // Therefore, we can add a trailing character output to the format-string
|
|
|
+ // and check that we failed to parse any token into the eof-character token.
|
|
|
+ if (sscanf(std::string(ip).c_str(), "%3u.%3u.%3u.%3u%c", &ip0, &ip1, &ip2, &ip3, &eof) != 4) {
|
|
|
return false;
|
|
|
}
|
|
|
+ // Affirm that each OCTET fits in a single byte (at most 0xFF).
|
|
|
return ip0 <= 0xFF && ip1 <= 0xFF && ip2 <= 0xFF && ip3 <= 0xFF;
|
|
|
}
|
|
|
|
|
|
inline bool ipv6(std::string_view ip) {
|
|
|
int expected_spans = 8;
|
|
|
|
|
|
+ // There is a special rule with IPv6 to allow an IPv4 address as a suffix
|
|
|
if (size_t n = ip.find('.'); n != std::string::npos) {
|
|
|
if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
|
|
|
return false;
|
|
|
}
|
|
|
- // This is a cheat to allow e.g. ::127.0.0.1 to validate
|
|
|
+ // since ipv4 addresses contain 4 bytes of information, and each segment of
|
|
|
+ // an ipv6 address contains 2 bytes - we should reduce the number of
|
|
|
+ // expected spans to 6. Instead - we reduce it to 7 because we don't prune
|
|
|
+ // the first OCTET of the IPv4 section (as it can read as a valid segment).
|
|
|
expected_spans = 7;
|
|
|
ip = ip.substr(0, n);
|
|
|
}
|
|
|
|
|
|
+ // IPv6 address MAY only contain HEXDIGITs and ':'
|
|
|
if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
|
|
|
return false;
|
|
|
}
|
|
|
+ // IPv6 addresses can have a maximum of 39 characters (8 4-char HEXDIGIT
|
|
|
+ // segments with 7 dividing ':'s).
|
|
|
if (ip.size() >= 40) {
|
|
|
return false;
|
|
|
}
|
|
|
@@ -208,25 +619,33 @@ inline bool ipv6(std::string_view ip) {
|
|
|
has_compressed = true;
|
|
|
ip.remove_prefix(2);
|
|
|
}
|
|
|
- while (!ip.empty()) {
|
|
|
+
|
|
|
+ while (!ip.empty() && ++groups) {
|
|
|
int data;
|
|
|
if (sscanf(ip.data(), "%4x", &data) != 1) {
|
|
|
+ // Not a group of up to 4 HEXDIGITs. Not sure that it's ever possible due to the
|
|
|
+ // char filter above.
|
|
|
return false;
|
|
|
}
|
|
|
- if (size_t n = ip.find(':'); std::min(n, ip.size()) > 4) {
|
|
|
- return false;
|
|
|
+
|
|
|
+ if (size_t const n = ip.find(':'); std::min(n, ip.size()) > 4) {
|
|
|
+ return false; // Segment too wide
|
|
|
} else if (n != std::string::npos) {
|
|
|
ip.remove_prefix(n + 1);
|
|
|
} else {
|
|
|
- ip = "";
|
|
|
+ break; // End of String
|
|
|
}
|
|
|
- ++groups;
|
|
|
- if (ip[0] == ':') {
|
|
|
- if (std::exchange(has_compressed, true)) {
|
|
|
- return false;
|
|
|
- }
|
|
|
- ip.remove_prefix(1);
|
|
|
+
|
|
|
+ // We removed the regular ':', so this is a check for a compression mark
|
|
|
+ if (ip[0] != ':') {
|
|
|
+ continue;
|
|
|
}
|
|
|
+ if (std::exchange(has_compressed, true)) {
|
|
|
+ // The above trick allows us to ensure that there is no more than one
|
|
|
+ // set of "::" compression tokens in this IPv6 address.
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+ ip.remove_prefix(1);
|
|
|
}
|
|
|
|
|
|
return groups == expected_spans || (has_compressed && groups < expected_spans);
|
|
|
@@ -237,65 +656,135 @@ inline bool ipv6(std::string_view ip) {
|
|
|
// Therefore, there's no point in trying to validate things according to a
|
|
|
// complex grammar - as long as it has an '@' sign with at least one character
|
|
|
// on each side, we ought to call it an email.
|
|
|
-inline bool email(std::string_view em) {
|
|
|
- size_t n = em.find_last_of('@');
|
|
|
+template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
|
|
|
+ using delim = detail::char_delimiters<CharT>;
|
|
|
+ size_t const n = em.find_last_of('@');
|
|
|
if (n == 0 || n >= em.size() - 1) {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
- if (em[0] == '"' && em[n - 1] == '"') {
|
|
|
+ auto const who = em.substr(0, n);
|
|
|
+ if (who.starts_with('"') && who.ends_with('"')) {
|
|
|
// No validation
|
|
|
- } else if (em.substr(0, n).find("..") != std::string::npos || em[n - 1] == '.' || em[0] == '.') {
|
|
|
+ } else if (who.starts_with('.') || who.ends_with('.')) {
|
|
|
+ return false;
|
|
|
+ } else if (em.substr(0, n).find(delim::dotdot) != em.npos) {
|
|
|
+ return false;
|
|
|
+ } else if (who.find('@') != em.npos) {
|
|
|
+ // This will catch multiple emails, but will gracefully ignore quote-escaped
|
|
|
+ // '@' characters in the name element.
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
- em.remove_prefix(n + 1);
|
|
|
- if (em.front() == '[' && em.back() == ']') {
|
|
|
- em.remove_prefix(1);
|
|
|
- em.remove_suffix(1);
|
|
|
- if (em.starts_with("IPv6:")) {
|
|
|
- return ipv6(std::string(em.substr(5)));
|
|
|
- }
|
|
|
- return ipv4(std::string(em)); // Re-acquire NULL-term
|
|
|
+ // The DOMAIN section of an email address MAY be either a HOSTNAME, or an
|
|
|
+ // IP Address surrounded in brackets.
|
|
|
+ auto domain = em.substr(n + 1);
|
|
|
+ if (not(domain.starts_with('[') && domain.ends_with(']'))) {
|
|
|
+ return hostname(domain);
|
|
|
+ }
|
|
|
+ domain.remove_prefix(1);
|
|
|
+ domain.remove_suffix(1);
|
|
|
+
|
|
|
+ // When the DOMAIN is an IPv6, it must start with "IPv6:" for some
|
|
|
+ // weird compatibility reason.
|
|
|
+ if (auto ip = detail::to_u8(domain); ip.starts_with("IPv6:")) {
|
|
|
+ return ipv6(ip.substr(5));
|
|
|
+ } else {
|
|
|
+ return ipv4(ip);
|
|
|
}
|
|
|
- return hostname(em);
|
|
|
}
|
|
|
+
|
|
|
+template <typename T> inline bool ctor_as_valid(std::string_view str) {
|
|
|
+ try {
|
|
|
+ [[maybe_unused]] auto _ = T(str);
|
|
|
+ return true;
|
|
|
+ } catch (std::exception const &) { return false; }
|
|
|
+}
|
|
|
+
|
|
|
+#if JVALIDATE_HAS_IDNA
|
|
|
+template <auto Predicate> bool utf32(std::string_view str) {
|
|
|
+ return Predicate(detail::to_u32(str));
|
|
|
+}
|
|
|
+#endif
|
|
|
}
|
|
|
|
|
|
namespace jvalidate {
|
|
|
class FormatValidator {
|
|
|
public:
|
|
|
- using Predicate = bool (*)(std::string_view);
|
|
|
+ using StatelessPredicate = bool (*)(std::string_view);
|
|
|
+ using Predicate = std::function<bool(std::string_view)>;
|
|
|
+ using UserDefinedFormats = std::unordered_map<std::string, Predicate>;
|
|
|
enum class Status { Unknown, Unimplemented, Valid, Invalid };
|
|
|
|
|
|
private:
|
|
|
- std::unordered_map<std::string, Predicate> supported_formats_{
|
|
|
+ // This isn't actually a user format, but we don't generate any special
|
|
|
+ // annotations for user-defined format codes, so it doesn't really matter that
|
|
|
+ // we're putting it here. It simply reduces the number of LoC when setting up.
|
|
|
+ std::unordered_map<std::string, Predicate> formats_{{"regex", nullptr}};
|
|
|
+
|
|
|
+ std::unordered_map<std::string, StatelessPredicate> builtin_formats_{
|
|
|
{"date", &format::date},
|
|
|
{"date-time", &format::date_time},
|
|
|
{"duration", &format::duration},
|
|
|
{"email", &format::email},
|
|
|
{"hostname", &format::hostname},
|
|
|
- {"idn-email", nullptr},
|
|
|
- {"idn-hostname", nullptr},
|
|
|
+ {"idn-email", UTF32(email)},
|
|
|
+ {"idn-hostname", UTF32(hostname)},
|
|
|
{"ipv4", &format::ipv4},
|
|
|
{"ipv6", &format::ipv6},
|
|
|
- {"iri", nullptr},
|
|
|
- {"iri-reference", nullptr},
|
|
|
- {"json-pointer", nullptr},
|
|
|
- {"relative-json-pointer", nullptr},
|
|
|
- /* {"regex", &detail::StdRegexEngine::is_valid}, */
|
|
|
+ {"iri", UTF32(uri)},
|
|
|
+ {"iri-reference", UTF32(uri_reference)},
|
|
|
+ {"json-pointer", CONSTRUCTS(Pointer)},
|
|
|
+ {"relative-json-pointer", CONSTRUCTS(RelativePointer)},
|
|
|
{"time", &format::time},
|
|
|
- {"uri", nullptr},
|
|
|
- {"uri-reference", nullptr},
|
|
|
+ {"uri", &format::uri},
|
|
|
+ {"uri-reference", &format::uri_reference},
|
|
|
+#if JVALIDATE_HAS_IDNA
|
|
|
+ {"uri-template", &format::utf32<format::uri_template>},
|
|
|
+#else
|
|
|
{"uri-template", nullptr},
|
|
|
+#endif
|
|
|
{"uuid", &format::uuid},
|
|
|
};
|
|
|
|
|
|
+ std::unordered_map<std::string, StatelessPredicate> draft03_formats_{
|
|
|
+ {"date", &format::date},
|
|
|
+ // One of the weird things about draft03 - date-time allows for timezone
|
|
|
+ // and fraction-of-second in the argument, but time only allows hh:mm:ss.
|
|
|
+ {"date-time", &format::date_time},
|
|
|
+ {"time", &format::draft03::time},
|
|
|
+ {"utc-millisec", &format::draft03::utc_millisec},
|
|
|
+ {"color", &format::draft03::css_2_1_color},
|
|
|
+ {"style", nullptr},
|
|
|
+ {"phone", &format::draft03::e_123_phone},
|
|
|
+ {"uri", &format::uri},
|
|
|
+ {"email", &format::email},
|
|
|
+ {"ip-address", &format::ipv4},
|
|
|
+ {"ipv6", &format::ipv6},
|
|
|
+ {"host-name", &format::hostname},
|
|
|
+ };
|
|
|
+
|
|
|
public:
|
|
|
FormatValidator() = default;
|
|
|
+ FormatValidator(Predicate is_regex) { formats_.insert_or_assign("regex", is_regex); }
|
|
|
+ FormatValidator(UserDefinedFormats const & formats, Predicate is_regex) : formats_(formats) {
|
|
|
+ formats_.insert_or_assign("regex", is_regex);
|
|
|
+ }
|
|
|
+
|
|
|
+ Status operator()(std::string const & format, schema::Version for_version,
|
|
|
+ std::string_view text) const {
|
|
|
+ auto const & supported =
|
|
|
+ for_version == schema::Version::Draft03 ? draft03_formats_ : builtin_formats_;
|
|
|
+ if (Status rval = (*this)(supported, format, text); rval != Status::Unknown) {
|
|
|
+ return rval;
|
|
|
+ }
|
|
|
+ return (*this)(formats_, format, text);
|
|
|
+ }
|
|
|
|
|
|
- Status operator()(std::string const & format, std::string_view text) const {
|
|
|
- if (auto it = supported_formats_.find(format); it != supported_formats_.end() && it->second) {
|
|
|
+private:
|
|
|
+ Status operator()(auto const & supported, std::string const & format,
|
|
|
+ std::string_view text) const {
|
|
|
+ if (auto it = supported.find(format); it != supported.end()) {
|
|
|
if (not it->second) {
|
|
|
return Status::Unimplemented;
|
|
|
}
|
|
|
@@ -305,3 +794,6 @@ public:
|
|
|
}
|
|
|
};
|
|
|
}
|
|
|
+
|
|
|
+#undef CONSTRUCTS
|
|
|
+#undef UTF32
|