format.h 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829
  1. #pragma once
  2. #include <jvalidate/_config.h>
  3. #include <jvalidate/_macro.h>
  4. /*
  5. NOLINTBEGIN(readability-identifier-length,
  6. bugprone-inc-dec-in-conditions,
  7. cppcoreguidelines-avoid-magic-numbers,
  8. bugprone-suspicious-stringview-data-usage,
  9. readability-implicit-bool-conversion,
  10. cppcoreguidelines-narrowing-conversions,
  11. readability-identifier-length)
  12. */
  13. #include <cstdint>
  14. #include <cstdio>
  15. #include <functional>
  16. #include <cctype>
  17. #include <chrono> // IWYU pragma: keep
  18. #include <cstddef>
  19. #include <cstring>
  20. #include <ctime>
  21. #include <string>
  22. #include <string_view>
  23. #include <system_error>
  24. #include <unordered_map>
  25. #include <unordered_set>
  26. #include <utility>
  27. #if JVALIDATE_HAS_IDNA
  28. #include <ada/idna/to_unicode.h>
  29. #include <ada/idna/validity.h>
  30. #endif
  31. #include <jvalidate/detail/expect.h>
  32. #include <jvalidate/detail/idna_special_cases.h>
  33. #include <jvalidate/detail/pointer.h>
  34. #include <jvalidate/detail/relative_pointer.h>
  35. #include <jvalidate/detail/string.h>
  36. #include <jvalidate/enum.h>
  37. #include <jvalidate/forward.h>
  38. #define CONSTRUCTS(TYPE) format::ctor_as_valid<detail::TYPE>
  39. #define UTF32(FN) JVALIDATE_IIF(JVALIDATE_HAS_IDNA, format::utf32<format::FN<char32_t>>, nullptr)
  40. namespace jvalidate::format {
  41. bool date(std::string_view dt);
  42. bool time(std::string_view dt);
  43. bool date_time(std::string_view dt);
  44. bool duration(std::string_view dur);
  45. template <typename CharT = char> bool uri(std::basic_string_view<CharT> uri);
  46. template <typename CharT = char> bool uri_reference(std::basic_string_view<CharT> uri);
  47. bool uri_template(std::u32string_view uri);
  48. bool uuid(std::string_view id);
  49. template <typename CharT = char> bool hostname(std::basic_string_view<CharT> name);
  50. bool ipv4(std::string_view ip);
  51. bool ipv6(std::string_view ip);
  52. template <typename CharT = char> bool email(std::basic_string_view<CharT> em);
  53. }
  54. namespace jvalidate::format::detail {
  55. inline bool is_dec(std::string_view s, size_t min = 0, size_t max = std::string_view::npos) {
  56. constexpr char const * g_dec_digits = "0123456789";
  57. return s.find_first_not_of(g_dec_digits) == std::string::npos && s.size() >= min &&
  58. s.size() <= max;
  59. }
  60. inline bool is_hex(std::string_view s) {
  61. constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
  62. return s.find_first_not_of(g_hex_digits) == std::string::npos;
  63. }
  64. struct Result {
  65. ptrdiff_t consumed;
  66. bool valid;
  67. };
  68. inline bool is_leapyear(int y) { return (y % 400) == 0 || ((y % 4) == 0 && (y % 100) != 0); }
  69. inline bool illegal_date(int y, int m, int d) {
  70. static constexpr std::array<int, 12> days = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
  71. if (is_leapyear(y) && m == 1) {
  72. --d;
  73. }
  74. return m >= 0 && m < days.size() && d > days.at(m);
  75. }
  76. inline Result date(std::string_view dt) {
  77. struct tm tm;
  78. if (char * end = strptime(dt.data(), "%Y-%m-%d", &tm); end) {
  79. if ((end - dt.data()) != 10 || illegal_date(tm.tm_year + 1900, tm.tm_mon, tm.tm_mday)) {
  80. return {.consumed = 0, .valid = false};
  81. }
  82. return {.consumed = end - dt.data(), .valid = true};
  83. }
  84. return {.consumed = 0L, .valid = false};
  85. }
  86. inline bool is_leapsecond(std::tm tm) {
  87. if (tm.tm_sec != 60) {
  88. return true; // NOLINT(readability-simplify-boolean-expr) See below...
  89. }
  90. #if __cpp_lib_chrono >= 201907L
  91. tm.tm_isdst = -1;
  92. std::chrono::seconds time(std::mktime(&tm));
  93. auto const & leap_seconds = std::chrono::get_tzdb().leap_seconds;
  94. return std::ranges::find(leap_seconds, time) != leap_seconds.end();
  95. #else
  96. return false;
  97. #endif
  98. }
  99. // https://www.rfc-editor.org/rfc/rfc6570.html#section-1.5
  100. inline bool is_uschar(int c) {
  101. using P = std::pair<int, int>;
  102. constexpr std::array data{
  103. P{0xA0, 0xD7FF}, P{0xF900, 0xFDCF}, P{0xFDF0, 0xFFEF}, P{0x10000, 0x1FFFD},
  104. P{0x20000, 0x2FFFD}, P{0x30000, 0x3FFFD}, P{0x40000, 0x4FFFD}, P{0x50000, 0x5FFFD},
  105. P{0x60000, 0x6FFFD}, P{0x70000, 0x7FFFD}, P{0x80000, 0x8FFFD}, P{0x90000, 0x9FFFD},
  106. P{0xA0000, 0xAFFFD}, P{0xB0000, 0xBFFFD}, P{0xC0000, 0xCFFFD}, P{0xD0000, 0xDFFFD},
  107. P{0xE0000, 0xEFFFD},
  108. };
  109. return std::ranges::any_of(data,
  110. [c](auto & pair) { return c >= pair.first && c <= pair.second; });
  111. }
  112. template <typename CharT>
  113. inline bool is_pchar(std::basic_string_view<CharT> part, size_t & pos,
  114. std::string_view extra_valid_chars = ":@") {
  115. constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
  116. if (std::isalnum(part[pos]) || is_uschar(part[pos]) ||
  117. std::strchr("-._~!$&'()*+,;=", part[pos])) {
  118. return true;
  119. }
  120. if (part[pos] == '%') {
  121. return pos + 2 < part.size() && std::strchr(g_hex_digits, part[++pos]) &&
  122. std::strchr(g_hex_digits, part[++pos]);
  123. }
  124. return extra_valid_chars.find(part[pos]) != std::basic_string_view<CharT>::npos;
  125. };
  126. inline bool is_uri_template_literal(std::u32string_view part, size_t & pos) {
  127. constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
  128. if (part[pos] == '%') {
  129. return pos + 2 < part.size() && std::strchr(g_hex_digits, part[++pos]) &&
  130. std::strchr(g_hex_digits, part[++pos]);
  131. }
  132. return !std::strchr(R"( "'%<>\^`{|}`)", part[pos]) && part[pos] > 0x1F && part[pos] != 0x7F;
  133. }
  134. inline bool is_uri_template_varchar(std::u32string_view part, size_t & pos) {
  135. constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
  136. if (part[pos] == '%') {
  137. return pos + 2 < part.size() && std::strchr(g_hex_digits, part[++pos]) &&
  138. std::strchr(g_hex_digits, part[++pos]);
  139. }
  140. return std::isalnum(part[pos]) || part[pos] == '_';
  141. }
  142. inline bool is_uri_template_expression(std::u32string_view part) {
  143. if (part.empty()) {
  144. return false;
  145. }
  146. if (std::strchr("+#./;?&=,!@|", part[0])) {
  147. part.remove_prefix(1);
  148. }
  149. for (size_t pos = part.find(','); !part.empty();
  150. part.remove_prefix(std::min(part.size(), pos)), pos = part.find(',')) {
  151. std::u32string_view varspec = part.substr(0, pos);
  152. std::u32string_view expand;
  153. if (size_t const mod = varspec.find_first_of(U":*"); mod != std::u32string_view::npos) {
  154. expand = varspec.substr(mod + 1);
  155. varspec.remove_suffix(expand.size() + 1);
  156. }
  157. if (expand.empty() || expand == U"*") {
  158. // No Modifier, or Explode
  159. } else if (expand.size() > 4 || expand[0] == '0' ||
  160. not std::ranges::all_of(expand, [](char c) { return std::isdigit(c); })) {
  161. return false;
  162. }
  163. for (size_t i = 0; i < varspec.size(); ++i) {
  164. JVALIDATE_RETURN_UNLESS(is_uri_template_varchar(varspec, i) || (i > 0 && varspec[i] == '.'),
  165. false);
  166. }
  167. }
  168. return true;
  169. }
  170. template <typename CharT> bool is_uri_authority(std::basic_string_view<CharT> uri) {
  171. // A URI Authority section MAY contain user info, which is every character up
  172. // to the first "@" character, as long as that character is not part of the path
  173. if (size_t pos = uri.find('@'); pos != uri.npos) {
  174. for (size_t i = 0; i < pos; ++i) {
  175. if (not is_pchar(uri, i, ":")) {
  176. return false;
  177. }
  178. }
  179. uri.remove_prefix(pos + 1);
  180. }
  181. // A URI Authority HOST section
  182. // If the URI starts with '[', then it MUST BE an IPv6 or an "IPvFuture"
  183. bool const has_ipv6 = (uri[0] == '[');
  184. if (has_ipv6) {
  185. size_t pos = uri.find(']');
  186. auto ip = uri.substr(1, pos - 1);
  187. uri.remove_prefix(pos + 1);
  188. if (not ipv6(to_u8(ip))) {
  189. return false;
  190. }
  191. }
  192. // A URI Authority PORT section. Technically allows any number of digits
  193. if (size_t pos = uri.find(':'); pos != uri.npos) {
  194. if (not std::ranges::all_of(uri.substr(pos + 1), [](auto c) { return std::isdigit(c); })) {
  195. return false;
  196. }
  197. uri.remove_suffix(uri.size() - pos + 1);
  198. }
  199. // Normal URI Authority HOST section is either an IPv4 or a HOSTNAME
  200. // if we had an ipv6 part, we can permit an empty string (since hostname
  201. // no longer permits them).
  202. return (has_ipv6 && uri.empty()) || ipv4(to_u8(uri)) || hostname(uri);
  203. }
  204. // Tests if a URI "Query Part" or "Fragment Part" is valid and remove the part
  205. template <typename CharT> bool test_uri_part(std::basic_string_view<CharT> & uri, char delim) {
  206. size_t const pos = uri.find(delim);
  207. if (pos == uri.npos) {
  208. return true;
  209. }
  210. auto part = uri.substr(pos + 1);
  211. uri = uri.substr(0, pos);
  212. for (size_t pos = 0; pos < part.size(); ++pos) {
  213. JVALIDATE_RETURN_UNLESS(detail::is_pchar(part, pos, ":@/?"), false);
  214. }
  215. return true;
  216. };
  217. }
  218. namespace jvalidate::format::draft03 {
  219. namespace detail = jvalidate::format::detail;
  220. inline bool time(std::string_view dt) {
  221. std::tm tm;
  222. char const * end = strptime(dt.data(), "%T", &tm);
  223. if (end == nullptr || (end - dt.data()) < 8) {
  224. return false;
  225. }
  226. return end == dt.end();
  227. }
  228. inline bool utc_millisec(std::string_view utc) {
  229. int64_t itime = 0;
  230. if (auto [end, ec] = std::from_chars(utc.begin(), utc.end(), itime);
  231. ec == std::errc{} && end == utc.end()) {
  232. return true;
  233. }
  234. double dtime = 0.0;
  235. auto [end, ec] = std::from_chars(utc.begin(), utc.end(), dtime);
  236. return ec == std::errc{} && end == utc.end();
  237. }
  238. inline bool css_2_1_color(std::string_view color) {
  239. if (color.empty()) {
  240. return false;
  241. }
  242. constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
  243. if (color[0] == '#') {
  244. return color.size() <= 7 && detail::is_hex(color.substr(1));
  245. }
  246. static std::unordered_set<std::string_view> g_color_codes{
  247. "maroon", "red", "orange", "yellow", "olive", "purple", "fuchsia", "white", "lime",
  248. "green", "navy", "blue", "aqua", "teal", "black", "silver", "gray"};
  249. return g_color_codes.contains(color);
  250. }
  251. inline bool e_123_phone(std::string_view phone) {
  252. // https://support.secureauth.com/hc/en-us/articles/360036402211-Regular-Expressions-for-ITU-E-123-and-E-164-phone-number-formats
  253. if (phone.empty()) {
  254. return false;
  255. }
  256. if (phone[0] != '+') {
  257. constexpr size_t g_usa_phone_tokens = 3;
  258. char area[4], head[4], tail[5]; // NOLINT
  259. return sscanf(phone.data(), "(%3s) %3s %4s", area, head, tail) == g_usa_phone_tokens &&
  260. detail::is_dec(area, 3) && detail::is_dec(head, 3) && detail::is_dec(tail, 4);
  261. }
  262. char tok0[4], tok1[4], tok2[4], tok3[5]; // NOLINT
  263. constexpr size_t g_i18n_phone_tokens = 4;
  264. return sscanf(phone.data(), "+%3s %3s %3s %4s", tok0, tok1, tok2, tok3) == g_i18n_phone_tokens &&
  265. detail::is_dec(tok0, 1, 3) && detail::is_dec(tok1, 2, 3) && detail::is_dec(tok2, 2, 3) &&
  266. detail::is_dec(tok3, 4);
  267. }
  268. }
  269. namespace jvalidate::format {
  270. inline bool date(std::string_view dt) {
  271. auto [consumed, valid] = detail::date(dt);
  272. return valid && consumed == dt.size();
  273. }
  274. inline bool time(std::string_view dt) {
  275. std::tm tm;
  276. char const * end = strptime(dt.data(), "%T", &tm);
  277. if (end == nullptr || end == dt.end() || (end - dt.data()) < 8) {
  278. return false;
  279. }
  280. dt.remove_prefix(end - dt.begin());
  281. if (dt[0] == '.') {
  282. dt.remove_prefix(1);
  283. if (dt.empty() || not std::isdigit(dt[0])) {
  284. return false;
  285. }
  286. while (std::isdigit(dt[0])) {
  287. dt.remove_prefix(1);
  288. }
  289. }
  290. if (dt[0] == 'Z' || dt[0] == 'z') {
  291. return dt.size() == 1 && detail::is_leapsecond(tm);
  292. }
  293. if (std::strchr("+-", dt[0])) {
  294. dt.remove_prefix(1);
  295. return strptime(dt.data(), "%R", &tm) == dt.end() && detail::is_leapsecond(tm);
  296. }
  297. return false;
  298. }
  299. inline bool date_time(std::string_view dt) {
  300. auto [size, good] = detail::date(dt);
  301. if (not good || std::strchr("Tt", dt[size]) == nullptr) {
  302. return false;
  303. }
  304. dt.remove_prefix(size + 1);
  305. return time(dt);
  306. }
  307. template <typename CharT> inline bool uri(std::basic_string_view<CharT> uri) {
  308. using delim = detail::char_delimiters<CharT>;
  309. // https://www.rfc-editor.org/rfc/rfc3986.html#appendix-A
  310. if (size_t const pos = uri.find(':'); pos != uri.npos) {
  311. JVALIDATE_RETURN_UNLESS(std::isalpha(uri[0]), false);
  312. for (size_t i = 1; i < pos; ++i) {
  313. JVALIDATE_RETURN_UNLESS(std::isalnum(uri[i]) || std::strchr("+-.", uri[i]), false);
  314. }
  315. uri.remove_prefix(pos + 1);
  316. } else {
  317. return false;
  318. }
  319. JVALIDATE_RETURN_UNLESS(detail::test_uri_part(uri, '#'), false);
  320. JVALIDATE_RETURN_UNLESS(detail::test_uri_part(uri, '?'), false);
  321. auto path = uri;
  322. if (uri.starts_with(delim::double_slash)) {
  323. uri.remove_prefix(2);
  324. path = uri.substr(std::min(uri.size(), uri.find('/')));
  325. uri.remove_suffix(path.size());
  326. JVALIDATE_RETURN_UNLESS(detail::is_uri_authority(uri), false);
  327. }
  328. for (size_t i = 0; i < path.size(); ++i) {
  329. JVALIDATE_RETURN_UNLESS(detail::is_pchar(path, i, "/:@"), false);
  330. }
  331. return true;
  332. }
  333. template <typename CharT> inline bool uri_reference(std::basic_string_view<CharT> uri) {
  334. using delim = detail::char_delimiters<CharT>;
  335. if (jvalidate::format::uri(uri)) {
  336. return true;
  337. }
  338. JVALIDATE_RETURN_UNLESS(detail::test_uri_part(uri, '#'), false);
  339. JVALIDATE_RETURN_UNLESS(detail::test_uri_part(uri, '?'), false);
  340. auto path = uri;
  341. if (uri.starts_with(delim::double_slash)) {
  342. uri.remove_prefix(2);
  343. path = uri.substr(std::min(uri.size(), uri.find('/')));
  344. uri.remove_suffix(path.size());
  345. JVALIDATE_RETURN_UNLESS(detail::is_uri_authority(uri), false);
  346. }
  347. if (size_t const pos = path.find('/'); pos != path.npos) {
  348. for (size_t i = 0; i < pos; ++i) {
  349. JVALIDATE_RETURN_UNLESS(detail::is_pchar(path, i, "@"), false);
  350. }
  351. path.remove_prefix(pos);
  352. }
  353. for (size_t i = 0; i < path.size(); ++i) {
  354. JVALIDATE_RETURN_UNLESS(detail::is_pchar(path, i, "/:@"), false);
  355. }
  356. return true;
  357. }
  358. inline bool uri_template(std::u32string_view uri) {
  359. for (size_t i = 0; i < uri.size(); ++i) {
  360. if (uri[i] != '{') {
  361. JVALIDATE_RETURN_UNLESS(detail::is_uri_template_literal(uri, i), false);
  362. continue;
  363. }
  364. std::u32string_view expr = uri.substr(i + 1);
  365. size_t const pos = expr.find('}');
  366. JVALIDATE_RETURN_IF(pos == std::u32string_view::npos, false);
  367. JVALIDATE_RETURN_UNLESS(detail::is_uri_template_expression(expr.substr(0, pos)), false);
  368. i += pos + 1;
  369. }
  370. return true;
  371. }
  372. inline bool uuid(std::string_view id) {
  373. constexpr size_t g_uuid_len = 36;
  374. constexpr size_t g_uuid_tokens = 5;
  375. char tok0[9], tok1[5], tok2[5], tok3[5], tok4[13]; // NOLINT
  376. return id.size() == g_uuid_len &&
  377. // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage)
  378. sscanf(id.data(), "%8s-%4s-%4s-%4s-%12s", tok0, tok1, tok2, tok3, tok4) == g_uuid_tokens &&
  379. detail::is_hex(tok0) && detail::is_hex(tok1) && detail::is_hex(tok2) &&
  380. detail::is_hex(tok3) && detail::is_hex(tok4);
  381. }
  382. inline bool duration(std::string_view dur) {
  383. auto eat = [&dur](std::string_view text) {
  384. char type = '\0';
  385. unsigned int rep = 0;
  386. // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage)
  387. if (sscanf(dur.data(), "%u%c", &rep, &type) != 2 || text.find(type) == std::string::npos) {
  388. return std::string::npos;
  389. }
  390. dur.remove_prefix(dur.find(type) + 1);
  391. return text.find(type);
  392. };
  393. // All DURATION entities must start with the prefix 'P', and cannot be empty
  394. // past that point.
  395. if (dur[0] != 'P' || dur.size() == 1) {
  396. return false;
  397. }
  398. dur.remove_prefix(1);
  399. // Special Case: a duration measured in weeks is incompatible with other
  400. // duration tokens.
  401. if (eat("W") != std::string::npos) {
  402. return dur.empty();
  403. }
  404. // DURATION takes the following form, because we use the same token for both
  405. // Months and Minutes.
  406. // "P[#Y][#M][#D][T[#H][#M][#S]]".
  407. // At least one of the optional fields must be present.
  408. if (dur[0] != 'T') {
  409. std::string_view ymd{"YMD"};
  410. // Read YMD duration offsets in that order, allowing us to skip past them.
  411. while (not ymd.empty() && not dur.empty()) {
  412. if (size_t const pos = eat(ymd); pos != std::string::npos) {
  413. ymd.remove_prefix(pos + 1);
  414. } else {
  415. return false;
  416. }
  417. }
  418. if (dur.empty()) {
  419. return true;
  420. }
  421. }
  422. // If we have a 'T' prefix for Hour/Minute/Second offsets, we must have at
  423. // least one of them present.
  424. if (dur[0] != 'T' || dur.size() == 1) {
  425. return false;
  426. }
  427. dur.remove_prefix(1);
  428. std::string_view hms{"HMS"};
  429. // Read HMS duration offsets in that order, allowing us to skip past them.
  430. while (not hms.empty() && not dur.empty()) {
  431. if (size_t const pos = eat(hms); pos != std::string::npos) {
  432. hms.remove_prefix(pos + 1);
  433. } else {
  434. return false;
  435. }
  436. }
  437. return dur.empty();
  438. }
  439. template <typename CharT>
  440. bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
  441. using delim = detail::char_delimiters<CharT>;
  442. return (name.empty() || name.length() >= 64 ||
  443. (name.size() >= 4 && name.substr(2).starts_with(delim::illegal_dashes_ulabel)) ||
  444. name[0] == '-' || name.back() == '-');
  445. }
  446. #if !JVALIDATE_HAS_IDNA
  447. inline bool hostname_part(std::string_view name) {
  448. using delim = detail::char_delimiters<char>;
  449. if (is_invalid_size_or_boundary_hostname(name)) {
  450. return false;
  451. }
  452. return std::ranges::none_of(name, [](char c) { return c != '-' && not std::isalnum(c); });
  453. }
  454. #else
  455. template <typename CharT> inline bool hostname_part(std::basic_string_view<CharT> name) {
  456. using delim = detail::char_delimiters<CharT>;
  457. // Punycode is a way to restructure UTF-8 strings to be ASCII compatibly
  458. // All Punycode string start with "xn--" (and would therefore fail below).
  459. if (name.starts_with(delim::punycode_prefix)) {
  460. std::u32string decoded = detail::to_u32(ada::idna::to_unicode(detail::to_u8(name)));
  461. return (decoded != detail::to_u32(name)) && hostname_part<char32_t>(decoded);
  462. }
  463. // Unfortunately, the ada-idna library does not validate things like
  464. // "is there a HEBREW character after the HEBREW COMMA".
  465. if (not detail::is_special_case_ok(name)) {
  466. return false;
  467. }
  468. if (name.find_first_of(delim::illegal_hostname_chars) != name.npos) {
  469. return false;
  470. }
  471. // An INVALID hostname part is one of the following:
  472. // - empty
  473. // - more than 63 characters long
  474. // - starts or ends with a '-'
  475. // - matches the regular expression /^..--.*$/
  476. if (is_invalid_size_or_boundary_hostname(name)) {
  477. return false;
  478. }
  479. // This is a much easier check in hostname than idn-hostname, since we can
  480. // just check for alphanumeric and '-'.
  481. if constexpr (std::is_same_v<char, CharT>) {
  482. return std::ranges::none_of(name, [](char c) { return c != '-' && not std::isalnum(c); });
  483. } else {
  484. return ada::idna::is_label_valid(name);
  485. }
  486. }
  487. #endif
  488. template <typename CharT> inline bool hostname(std::basic_string_view<CharT> name) {
  489. using delim = detail::char_delimiters<CharT>;
  490. // In general, the maximum length of a hostname is 253 characters.
  491. if (name.empty() || name.length() > 253) {
  492. return false;
  493. }
  494. // We validate each sub-section of the hostname in parts, delimited by '.'
  495. for (size_t n = name.find_first_of(delim::hostname_part_delims); n != std::string::npos;
  496. name.remove_prefix(n + 1), n = name.find_first_of(delim::hostname_part_delims)) {
  497. if (not hostname_part(name.substr(0, n))) {
  498. return false;
  499. }
  500. }
  501. // Previous test versions allowed for a hostname to end with '.', but this is
  502. // not permitted in the latest test specification.
  503. return hostname_part(name);
  504. }
  505. inline bool ipv4(std::string_view ip) {
  506. unsigned int ip0, ip1, ip2, ip3; // NOLINT
  507. char eof = '\0';
  508. // IPv4 address MAY only contain DIGITS and '.'
  509. if (ip.find_first_not_of("0123456789.") != std::string_view::npos) {
  510. return false;
  511. }
  512. // Each OCTET of an IPv4 can only start with '0' if it is EXACTLY '0'
  513. if (ip[0] == '0' && std::isdigit(ip[1])) {
  514. return false;
  515. }
  516. if (size_t const pos = ip.find(".0");
  517. pos != std::string_view::npos && std::isdigit(ip[pos + 2])) {
  518. return false;
  519. }
  520. // sscanf returns the number of tokens parsed successfully.
  521. // Therefore, we can add a trailing character output to the format-string
  522. // and check that we failed to parse any token into the eof-character token.
  523. if (sscanf(std::string(ip).c_str(), "%3u.%3u.%3u.%3u%c", &ip0, &ip1, &ip2, &ip3, &eof) != 4) {
  524. return false;
  525. }
  526. // Affirm that each OCTET is only two bytes wide.
  527. return ip0 <= 0xFF && ip1 <= 0xFF && ip2 <= 0xFF && ip3 <= 0xFF;
  528. }
  529. inline bool ipv6(std::string_view ip) {
  530. int expected_spans = 8;
  531. // There is a special rule with IPv6 to allow an IPv4 address as a suffix
  532. if (size_t n = ip.find('.'); n != std::string::npos) {
  533. if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
  534. return false;
  535. }
  536. // since ipv4 addresses contain 8 bytes of information, and each segment of
  537. // an ipv6 address contains 4 bytes - we should reduce the number of
  538. // expected spans to 6. Instead - we reduce it to 7 because we don't prune
  539. // the first OCTET of the IPv4 section (as it can read as a valid segment).
  540. expected_spans = 7;
  541. ip = ip.substr(0, n);
  542. }
  543. // IPv6 address MAY only contain HEXDIGITs and ':'
  544. if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
  545. return false;
  546. }
  547. // IPv6 addresses can have a maximum of 39 characters (8 4-char HEXDIGIT
  548. // segments with 7 dividing ':'s).
  549. if (ip.size() >= 40) {
  550. return false;
  551. }
  552. bool has_compressed = false;
  553. int groups = 0;
  554. if (ip.starts_with("::")) {
  555. has_compressed = true;
  556. ip.remove_prefix(2);
  557. }
  558. while (!ip.empty() && ++groups) {
  559. int data = 0;
  560. if (sscanf(ip.data(), "%4x", &data) != 1) {
  561. // Not a 4-byte HEXDIGIT. Not sure that it's ever possible due to the
  562. // char filter above.
  563. return false;
  564. }
  565. size_t const div_pos = ip.find(':');
  566. if (std::min(div_pos, ip.size()) > 4) {
  567. return false; // Segments must be between 1 and 4 characters long
  568. }
  569. if (div_pos != std::string::npos) {
  570. ip.remove_prefix(div_pos + 1);
  571. } else {
  572. break; // End of String
  573. }
  574. // We removed the regular ':', so this is a check for a compression mark
  575. if (ip[0] != ':') {
  576. continue;
  577. }
  578. if (std::exchange(has_compressed, true)) {
  579. // The above trick allows us to ensure that there is no more than one
  580. // set of "::" compression tokens in this IPv6 adfress.
  581. return false;
  582. }
  583. ip.remove_prefix(1);
  584. }
  585. return groups == expected_spans || (has_compressed && groups < expected_spans);
  586. }
  587. // Let's be honest - no matter what RFC 5321 §4.1.2 or RFC 6531 say, the only
  588. // way to know if an email address is valid is to try and send a message to it.
  589. // Therefore, there's no point in trying to validate things according to a
  590. // complex grammar - as long as it has an '@' sign with at least one character
  591. // on each side, we ought to call it an email.
  592. template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
  593. using delim = detail::char_delimiters<CharT>;
  594. size_t const n = em.find_last_of('@');
  595. if (n == 0 || n >= em.size() - 1) {
  596. return false;
  597. }
  598. auto const who = em.substr(0, n);
  599. if (who.starts_with('"') && who.ends_with('"')) {
  600. // No validation
  601. } else if (who.starts_with('.') || who.ends_with('.')) { // NOLINT(bugprone-branch-clone)
  602. return false;
  603. } else if (em.substr(0, n).find(delim::dotdot) != em.npos) {
  604. return false;
  605. } else if (who.find('@') != em.npos) {
  606. // This will catch multiple emails, but will gracefully ignore quote-escaped
  607. // '@' characters in the name element.
  608. return false;
  609. }
  610. // The DOMAIN section of an email address MAY be either a HOSTNAME, or an
  611. // IP Address surrounded in brackets.
  612. auto domain = em.substr(n + 1);
  613. if (not(domain.starts_with('[') && domain.ends_with(']'))) {
  614. return hostname(domain);
  615. }
  616. domain.remove_prefix(1);
  617. domain.remove_suffix(1);
  618. // When the DOMAIN is an IPv6, it must start with "IPv6:" for some
  619. // weird compatibility reason.
  620. auto ip = detail::to_u8(domain);
  621. if (ip.starts_with("IPv6:")) {
  622. return ipv6(ip.substr(5));
  623. }
  624. return ipv4(ip);
  625. }
  626. template <typename T> inline bool ctor_as_valid(std::string_view str) {
  627. try {
  628. [[maybe_unused]] auto _ = T(str);
  629. return true;
  630. } catch (std::exception const &) { return false; }
  631. }
  632. #if JVALIDATE_HAS_IDNA
  633. template <auto Predicate> bool utf32(std::string_view str) {
  634. return Predicate(detail::to_u32(str));
  635. }
  636. #endif
  637. }
  638. namespace jvalidate {
  639. class FormatValidator {
  640. public:
  641. using StatelessPredicate = bool (*)(std::string_view);
  642. using Predicate = std::function<bool(std::string_view)>;
  643. using UserDefinedFormats = std::unordered_map<std::string, Predicate>;
  644. enum class Status : int8_t { Unknown, Unimplemented, Valid, Invalid };
  645. private:
  646. // This isn't actually a user format, but we don't generate any special
  647. // annotations for user-defined format codes, so it doesn't really matter that
  648. // we're putting it here. It simply reduces the number of LoC when setting up.
  649. std::unordered_map<std::string, Predicate> formats_{{"regex", nullptr}};
  650. std::unordered_map<std::string, StatelessPredicate> builtin_formats_{
  651. {"date", &format::date},
  652. {"date-time", &format::date_time},
  653. {"duration", &format::duration},
  654. {"email", &format::email},
  655. {"hostname", &format::hostname},
  656. {"idn-email", UTF32(email)},
  657. {"idn-hostname", UTF32(hostname)},
  658. {"ipv4", &format::ipv4},
  659. {"ipv6", &format::ipv6},
  660. {"iri", UTF32(uri)},
  661. {"iri-reference", UTF32(uri_reference)},
  662. {"json-pointer", CONSTRUCTS(Pointer)},
  663. {"relative-json-pointer", CONSTRUCTS(RelativePointer)},
  664. {"time", &format::time},
  665. {"uri", &format::uri},
  666. {"uri-reference", &format::uri_reference},
  667. #if JVALIDATE_HAS_IDNA
  668. {"uri-template", &format::utf32<format::uri_template>},
  669. #else
  670. {"uri-template", nullptr},
  671. #endif
  672. {"uuid", &format::uuid},
  673. };
  674. std::unordered_map<std::string, StatelessPredicate> draft03_formats_{
  675. {"date", &format::date},
  676. // One of the weird things about draft03 - date-time allows for timezone
  677. // and fraction-of-second in the argument, but time only allows hh:mm:ss.
  678. {"date-time", &format::date_time},
  679. {"time", &format::draft03::time},
  680. {"utc-millisec", &format::draft03::utc_millisec},
  681. {"color", &format::draft03::css_2_1_color},
  682. {"style", nullptr},
  683. {"phone", &format::draft03::e_123_phone},
  684. {"uri", &format::uri},
  685. {"email", &format::email},
  686. {"ip-address", &format::ipv4},
  687. {"ipv6", &format::ipv6},
  688. {"host-name", &format::hostname},
  689. };
  690. public:
  691. FormatValidator() = default;
  692. explicit(false) FormatValidator(Predicate is_regex) {
  693. formats_.insert_or_assign("regex", is_regex);
  694. }
  695. FormatValidator(UserDefinedFormats formats, Predicate is_regex) : formats_(std::move(formats)) {
  696. formats_.insert_or_assign("regex", is_regex);
  697. }
  698. Status operator()(std::string const & format, schema::Version for_version,
  699. std::string_view text) const {
  700. auto const & supported =
  701. for_version == schema::Version::Draft03 ? draft03_formats_ : builtin_formats_;
  702. if (Status rval = (*this)(supported, format, text); rval != Status::Unknown) {
  703. return rval;
  704. }
  705. return (*this)(formats_, format, text);
  706. }
  707. private:
  708. Status operator()(auto const & supported, std::string const & format,
  709. std::string_view text) const {
  710. if (auto it = supported.find(format); it != supported.end()) {
  711. if (not it->second) {
  712. return Status::Unimplemented;
  713. }
  714. return it->second(text) ? Status::Valid : Status::Invalid;
  715. }
  716. return Status::Unknown;
  717. }
  718. };
  719. }
  720. #undef CONSTRUCTS
  721. #undef UTF32
  722. /*
  723. NOLINTEND(readability-identifier-length,
  724. bugprone-inc-dec-in-conditions,
  725. cppcoreguidelines-avoid-magic-numbers,
  726. bugprone-suspicious-stringview-data-usage,
  727. readability-implicit-bool-conversion,
  728. cppcoreguidelines-narrowing-conversions,
  729. readability-identifier-length)
  730. */