|
|
@@ -166,8 +166,10 @@ inline bool is_uri_template_expression(std::u32string_view part) {
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-template <typename CharT> inline bool is_uri_authority(std::basic_string_view<CharT> uri) {
|
|
|
- if (size_t pos = uri.find('@'); pos != uri.npos && pos < uri.find('/')) {
|
|
|
+template <typename CharT> bool is_uri_authority(std::basic_string_view<CharT> uri) {
|
|
|
+ // A URI Authority section MAY contain user info, which is every character up
|
|
|
+ // to the first "@" character, as long as that character is not part of the path
|
|
|
+ if (size_t pos = uri.find('@'); pos != uri.npos) {
|
|
|
for (size_t i = 0; i < pos; ++i) {
|
|
|
if (not is_pchar(uri, i, ":")) {
|
|
|
return false;
|
|
|
@@ -175,6 +177,9 @@ template <typename CharT> inline bool is_uri_authority(std::basic_string_view<Ch
|
|
|
}
|
|
|
uri.remove_prefix(pos + 1);
|
|
|
}
|
|
|
+
|
|
|
+ // A URI Authority HOST section
|
|
|
+ // If the URI starts with '[', then it MUST BE an IPv6 or an "IPvFuture"
|
|
|
if (uri[0] == '[') {
|
|
|
size_t pos = uri.find(']');
|
|
|
auto ip = uri.substr(1, pos - 1);
|
|
|
@@ -183,12 +188,16 @@ template <typename CharT> inline bool is_uri_authority(std::basic_string_view<Ch
|
|
|
return false;
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ // A URI Authority PORT section. Technically allows any number of digits
|
|
|
if (size_t pos = uri.find(':'); pos != uri.npos) {
|
|
|
if (not std::ranges::all_of(uri.substr(pos + 1), [](auto c) { return std::isdigit(c); })) {
|
|
|
return false;
|
|
|
}
|
|
|
uri.remove_suffix(uri.size() - pos + 1);
|
|
|
}
|
|
|
+
|
|
|
+ // Normal URI Authority HOST section is either an IPv4 or a HOSTNAME
|
|
|
return ipv4(to_u8(uri)) || hostname(uri);
|
|
|
}
|
|
|
|
|
|
@@ -351,16 +360,26 @@ inline bool duration(std::string_view dur) {
|
|
|
return text.find(type);
|
|
|
};
|
|
|
|
|
|
+ // All DURATION entities must start with the prefix 'P', and cannot be empty
|
|
|
+ // past that point.
|
|
|
if (dur[0] != 'P' || dur.size() == 1) {
|
|
|
return false;
|
|
|
}
|
|
|
dur.remove_prefix(1);
|
|
|
|
|
|
+ // Special Case: a duration measured in weeks is incompatible with other
|
|
|
+ // duration tokens.
|
|
|
+ if (eat("W") != std::string::npos) {
|
|
|
+ return dur.empty();
|
|
|
+ }
|
|
|
+
|
|
|
+ // DURATION takes the following form, because we use the same token for both
|
|
|
+ // Months and Minutes.
|
|
|
+ // "P[#Y][#M][#D][T[#H][#M][#S]]".
|
|
|
+ // At least one of the optional fields must be present.
|
|
|
if (dur[0] != 'T') {
|
|
|
- if (eat("W") != std::string::npos) {
|
|
|
- return dur.empty();
|
|
|
- }
|
|
|
std::string_view ymd{"YMD"};
|
|
|
+ // Read YMD duration offsets in that order, allowing us to skip past them.
|
|
|
while (not ymd.empty() && not dur.empty()) {
|
|
|
if (size_t n = eat(ymd); n != std::string::npos) {
|
|
|
ymd.remove_prefix(n + 1);
|
|
|
@@ -373,12 +392,15 @@ inline bool duration(std::string_view dur) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ // If we have a 'T' prefix for Hour/Minute/Second offsets, we must have at
|
|
|
+ // least one of them present.
|
|
|
if (dur[0] != 'T' || dur.size() == 1) {
|
|
|
return false;
|
|
|
}
|
|
|
dur.remove_prefix(1);
|
|
|
|
|
|
std::string_view hms{"HMS"};
|
|
|
+ // Read HMS duration offsets in that order, allowing us to skip past them.
|
|
|
while (not hms.empty() && not dur.empty()) {
|
|
|
if (size_t n = eat(hms); n != std::string::npos) {
|
|
|
hms.remove_prefix(n + 1);
|
|
|
@@ -389,10 +411,6 @@ inline bool duration(std::string_view dur) {
|
|
|
return dur.empty();
|
|
|
}
|
|
|
|
|
|
-template <typename CharT> bool is_invalid_host_char(CharT c) {
|
|
|
- return c != '-' && not(std::isalnum(c) || c > 0x7F);
|
|
|
-}
|
|
|
-
|
|
|
template <typename CharT>
|
|
|
bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
|
|
|
using delim = detail::char_delimiters<CharT>;
|
|
|
@@ -435,57 +453,79 @@ template <typename CharT> inline bool hostname(std::basic_string_view<CharT> nam
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
+ // In general, the maximum length of a hostname is 253 UTF-8 characters.
|
|
|
if (detail::to_u8(name).size() > (name.back() == '.' ? 254 : 253)) {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
+ // Unfortunately, the ada-idna library does not validate things like
|
|
|
+ // "is there a HEBREW character after the HEBREW COMMA".
|
|
|
if (not std::ranges::all_of(delim::special_cases,
|
|
|
[name](auto & sc) { return sc.accepts(name); })) {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
+ // We validate each sub-section of the hostname in parts, delimited by '.'
|
|
|
for (size_t n = name.find('.'); n != std::string::npos;
|
|
|
name.remove_prefix(n + 1), n = name.find('.')) {
|
|
|
if (not hostname_part(name.substr(0, n))) {
|
|
|
return false;
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ // name.empty() would be true only if the final character in the input name
|
|
|
+ // was '.', this is the only empty hostname part that we allow. Otherwise, we
|
|
|
+ // have a trailing hostname_part.
|
|
|
return name.empty() || hostname_part(name);
|
|
|
}
|
|
|
|
|
|
// Validates a dotted-quad IPv4 address such as "192.168.0.1".
//
// Accepts exactly four '.'-separated decimal OCTETs. Each OCTET must be one to
// three DIGITs, may only start with '0' if it is EXACTLY "0", and must not
// exceed 255. Anything else (including an empty string) is rejected.
//
// The input is parsed in place: no temporary string is allocated and no
// character outside of the view is ever read.
//
// @param ip the candidate address text
// @return true if ip is a well-formed IPv4 address, false otherwise
inline bool ipv4(std::string_view ip) {
  constexpr int octet_count = 4;
  constexpr size_t max_octet_digits = 3;

  for (int index = 0; index < octet_count; ++index) {
    bool const is_last = index == octet_count - 1;
    size_t const dot = ip.find('.');
    // Every OCTET but the last MUST be followed by '.', and the last MUST NOT
    // be. This rejects both too few and too many OCTETs, as well as a
    // trailing '.'.
    if (is_last != (dot == ip.npos)) {
      return false;
    }

    std::string_view const octet = ip.substr(0, dot);
    if (octet.empty() || octet.size() > max_octet_digits) {
      return false;
    }
    // Checked on every OCTET, so a leading zero is caught wherever it appears
    // in the address, not just in the first place it could occur.
    if (octet.size() > 1 && octet[0] == '0') {
      return false;
    }

    unsigned int value = 0;
    for (char const c : octet) {
      // IPv4 address MAY only contain DIGITS and '.'
      if (c < '0' || c > '9') {
        return false;
      }
      value = value * 10 + static_cast<unsigned int>(c - '0');
    }
    // An OCTET is a single byte, so its value cannot exceed 0xFF.
    if (value > 0xFF) {
      return false;
    }

    if (not is_last) {
      ip.remove_prefix(dot + 1);
    }
  }
  return true;
}
|
|
|
|
|
|
inline bool ipv6(std::string_view ip) {
|
|
|
int expected_spans = 8;
|
|
|
|
|
|
+ // There is a special rule with IPv6 to allow an IPv4 address as a suffix
|
|
|
if (size_t n = ip.find('.'); n != std::string::npos) {
|
|
|
if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
|
|
|
return false;
|
|
|
}
|
|
|
- // This is a cheat to allow e.g. ::127.0.0.1 to validate
|
|
|
+ // An IPv4 suffix carries 4 bytes, as much as two 2-byte IPv6 segments, so at
|
|
|
+ // most 6 real segments precede it. We expect 7 spans because the text kept
|
|
|
+ // below still ends in the first IPv4 octet ("::127"), counted as one more span.
|
|
|
expected_spans = 7;
|
|
|
ip = ip.substr(0, n);
|
|
|
}
|
|
|
|
|
|
+ // IPv6 address MAY only contain HEXDIGITs and ':'
|
|
|
if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
|
|
|
return false;
|
|
|
}
|
|
|
+ // IPv6 addresses can have a maximum of 39 characters (8 4-char HEXDIGIT
|
|
|
+ // segments with 7 dividing ':'s).
|
|
|
if (ip.size() >= 40) {
|
|
|
return false;
|
|
|
}
|
|
|
@@ -497,25 +537,31 @@ inline bool ipv6(std::string_view ip) {
|
|
|
has_compressed = true;
|
|
|
ip.remove_prefix(2);
|
|
|
}
|
|
|
- while (!ip.empty()) {
|
|
|
+
|
|
|
+ while (!ip.empty() && ++groups) {
|
|
|
int data;
|
|
|
if (sscanf(ip.data(), "%4x", &data) != 1) {
|
|
|
- return false;
|
|
|
+ return false; // No HEXDIGIT at the start of this segment
|
|
|
}
|
|
|
- if (size_t n = ip.find(':'); std::min(n, ip.size()) > 4) {
|
|
|
- return false;
|
|
|
+
|
|
|
+ if (size_t const n = ip.find(':'); std::min(n, ip.size()) > 4) {
|
|
|
+ return false; // Segment too wide
|
|
|
} else if (n != std::string::npos) {
|
|
|
ip.remove_prefix(n + 1);
|
|
|
} else {
|
|
|
- ip = "";
|
|
|
+ break; // End of String
|
|
|
}
|
|
|
- ++groups;
|
|
|
- if (ip[0] == ':') {
|
|
|
- if (std::exchange(has_compressed, true)) {
|
|
|
- return false;
|
|
|
- }
|
|
|
- ip.remove_prefix(1);
|
|
|
+
|
|
|
+ // We removed the regular ':', so this is a check for a compression mark
|
|
|
+ if (ip[0] != ':') {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if (std::exchange(has_compressed, true)) {
|
|
|
+ // The above trick allows us to ensure that there is no more than one
|
|
|
+ // set of "::" compression tokens in this IPv6 address.
|
|
|
+ return false;
|
|
|
}
|
|
|
+ ip.remove_prefix(1);
|
|
|
}
|
|
|
|
|
|
return groups == expected_spans || (has_compressed && groups < expected_spans);
|
|
|
@@ -528,7 +574,7 @@ inline bool ipv6(std::string_view ip) {
|
|
|
// on each side, we ought to call it an email.
|
|
|
template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
|
|
|
using delim = detail::char_delimiters<CharT>;
|
|
|
- size_t n = em.find_last_of('@');
|
|
|
+ size_t const n = em.find_last_of('@');
|
|
|
if (n == 0 || n >= em.size() - 1) {
|
|
|
return false;
|
|
|
}
|
|
|
@@ -542,6 +588,8 @@ template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
+ // The DOMAIN section of an email address MAY be either a HOSTNAME, or an
|
|
|
+ // IP Address surrounded in brackets.
|
|
|
auto domain = em.substr(n + 1);
|
|
|
if (not(domain.starts_with('[') && domain.ends_with(']'))) {
|
|
|
return hostname(domain);
|
|
|
@@ -549,6 +597,8 @@ template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
|
|
|
domain.remove_prefix(1);
|
|
|
domain.remove_suffix(1);
|
|
|
|
|
|
+ // When the DOMAIN is an IPv6 address literal, it must carry the "IPv6:" tag
|
|
|
+ // required by the RFC 5321 address-literal syntax.
|
|
|
if (auto ip = detail::to_u8(domain); ip.starts_with("IPv6:")) {
|
|
|
return ipv6(ip.substr(5));
|
|
|
} else {
|