format.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522
  1. #pragma once
  2. #include <jvalidate/_macro.h>
  3. #include <cctype>
  4. #include <chrono>
  5. #include <cstddef>
  6. #include <cstring>
  7. #include <ctime>
  8. #include <string>
  9. #include <string_view>
  10. #include <unordered_map>
  11. #include <utility>
  12. #ifdef JVALIDATE_HAS_IDNA
  13. #include <ada/idna/to_unicode.h>
  14. #include <ada/idna/validity.h>
  15. #endif
  16. #include <jvalidate/detail/expect.h>
  17. #include <jvalidate/detail/idna_special_cases.h>
  18. #include <jvalidate/detail/pointer.h>
  19. #include <jvalidate/detail/relative_pointer.h>
  20. #include <jvalidate/detail/string.h>
  21. #include <jvalidate/forward.h>
// Expands to a format predicate that accepts a string exactly when an object
// of type detail::TYPE can be constructed from it (see format::ctor_as_valid).
#define CONSTRUCTS(TYPE) format::ctor_as_valid<detail::TYPE>
// Expands to the char32_t instantiation of format::FN wrapped in
// format::utf32, or to nullptr.
// NOTE(review): presumably JVALIDATE_IIF picks the first alternative when
// JVALIDATE_HAS_IDNA is truthy and the second otherwise - confirm in _macro.h.
#define UTF32(FN) JVALIDATE_IIF(JVALIDATE_HAS_IDNA, format::utf32<format::FN<char32_t>>, nullptr)
// Forward declarations of the format predicates defined further down in this
// header. They are declared up front because helpers in format::detail call
// ipv4/ipv6/hostname before those functions are defined.
namespace jvalidate::format {
bool date(std::string_view dt);
bool time(std::string_view dt);
bool date_time(std::string_view dt);
bool duration(std::string_view dur);
// The templated predicates default to char; the char32_t instantiations are
// the ones referenced through the UTF32 macro.
template <typename CharT = char> bool uri(std::basic_string_view<CharT> uri);
bool uuid(std::string_view id);
template <typename CharT = char> bool hostname(std::basic_string_view<CharT> name);
bool ipv4(std::string_view ip);
bool ipv6(std::string_view ip);
template <typename CharT = char> bool email(std::basic_string_view<CharT> em);
}
  36. namespace jvalidate::format::detail {
// Outcome of parsing a prefix of a string (returned by detail::date).
struct result {
  // Number of characters of the input that were consumed; 0 on failure.
  ptrdiff_t consumed;
  // Whether the consumed prefix was accepted.
  bool valid;
};
  41. inline bool is_leapyear(int y) { return (y % 400) == 0 || ((y % 4) == 0 && (y % 100) != 0); }
  42. inline bool illegal_date(int y, int m, int d) {
  43. static constexpr int days[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
  44. if (is_leapyear(y) && m == 1) {
  45. --d;
  46. }
  47. return d > days[m];
  48. }
  49. inline result date(std::string_view dt) {
  50. struct tm tm;
  51. if (auto end = strptime(dt.data(), "%Y-%m-%d", &tm); end) {
  52. if ((end - dt.data()) != 10 || illegal_date(tm.tm_year + 1900, tm.tm_mon, tm.tm_mday)) {
  53. return {.consumed = 0, .valid = false};
  54. }
  55. return {.consumed = end - dt.data(), .valid = true};
  56. }
  57. return {.consumed = 0L, .valid = false};
  58. }
// Decides whether the seconds field of `tm` is acceptable.
//
// Despite the name, this returns true for every time whose seconds value is
// not 60; only a value of exactly 60 is checked against known leap seconds.
inline bool is_leapsecond(std::tm tm) {
  if (tm.tm_sec != 60) {
    return true;
  }
#if __cpp_lib_chrono >= 201907L
  // Let mktime work out whether daylight saving time applies.
  tm.tm_isdst = -1;
  // NOTE(review): mktime interprets `tm` as local time and reads the date
  // fields as well - callers must have filled those in; confirm this matches
  // how the leap-second list is expressed.
  std::chrono::seconds time(std::mktime(&tm));
  auto const & leap_seconds = std::chrono::get_tzdb().leap_seconds;
  return std::ranges::find(leap_seconds, time) != leap_seconds.end();
#else
  // Without the chrono time-zone database every ":60" is rejected.
  return false;
#endif
}
  72. inline bool is_uschar(int c) {
  73. using P = std::pair<int, int>;
  74. constexpr std::array data{
  75. P{0xA0, 0xD7FF}, P{0xF900, 0xFDCF}, P{0xFDF0, 0xFFEF}, P{0x10000, 0x1FFFD},
  76. P{0x20000, 0x2FFFD}, P{0x30000, 0x3FFFD}, P{0x40000, 0x4FFFD}, P{0x50000, 0x5FFFD},
  77. P{0x60000, 0x6FFFD}, P{0x70000, 0x7FFFD}, P{0x80000, 0x8FFFD}, P{0x90000, 0x9FFFD},
  78. P{0xA0000, 0xAFFFD}, P{0xB0000, 0xBFFFD}, P{0xC0000, 0xCFFFD}, P{0xD0000, 0xDFFFD},
  79. P{0xE0000, 0xEFFFD},
  80. };
  81. return std::ranges::any_of(data,
  82. [c](auto & pair) { return c >= pair.first && c <= pair.second; });
  83. }
  84. template <typename CharT>
  85. inline bool is_pchar(std::basic_string_view<CharT> part, size_t & pos,
  86. std::string_view extra_valid_chars = ":@") {
  87. constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
  88. if (std::isalnum(part[pos]) || is_uschar(part[pos]) ||
  89. std::strchr("-._~!$&'()*+,;=", part[pos])) {
  90. return true;
  91. }
  92. if (part[pos] == '%') {
  93. return std::strchr(g_hex_digits, part[++pos]) && std::strchr(g_hex_digits, part[++pos]);
  94. }
  95. return extra_valid_chars.find(part[pos]) != part.npos;
  96. };
  97. template <typename CharT> inline bool is_uri_authority(std::basic_string_view<CharT> uri) {
  98. if (size_t pos = uri.find('@'); pos != uri.npos && pos < uri.find('/')) {
  99. for (size_t i = 0; i < pos; ++i) {
  100. if (not is_pchar(uri, i, ":")) {
  101. return false;
  102. }
  103. }
  104. uri.remove_prefix(pos + 1);
  105. }
  106. if (uri[0] == '[') {
  107. size_t pos = uri.find(']');
  108. auto ip = uri.substr(1, pos - 1);
  109. uri.remove_prefix(pos + 1);
  110. if (not ipv6(to_u8(ip))) {
  111. return false;
  112. }
  113. }
  114. if (size_t pos = uri.find(':'); pos != uri.npos) {
  115. if (not std::ranges::all_of(uri.substr(pos + 1), [](auto c) { return std::isdigit(c); })) {
  116. return false;
  117. }
  118. uri.remove_suffix(uri.size() - pos + 1);
  119. }
  120. return ipv4(to_u8(uri)) || hostname(uri);
  121. }
  122. }
  123. namespace jvalidate::format {
  124. inline bool date(std::string_view dt) {
  125. auto [consumed, valid] = detail::date(dt);
  126. return valid && consumed == dt.size();
  127. }
  128. inline bool time(std::string_view dt) {
  129. std::tm tm;
  130. char const * end = strptime(dt.data(), "%T", &tm);
  131. if (end == nullptr || end == dt.end() || (end - dt.data()) < 8) {
  132. return false;
  133. }
  134. dt.remove_prefix(end - dt.begin());
  135. if (dt[0] == '.') {
  136. dt.remove_prefix(1);
  137. if (dt.empty() || not std::isdigit(dt[0])) {
  138. return false;
  139. }
  140. while (std::isdigit(dt[0])) {
  141. dt.remove_prefix(1);
  142. }
  143. }
  144. if (dt[0] == 'Z' || dt[0] == 'z') {
  145. return dt.size() == 1 && detail::is_leapsecond(tm);
  146. }
  147. if (std::strchr("+-", dt[0])) {
  148. return strptime(dt.data() + 1, "%R", &tm) == dt.end() && detail::is_leapsecond(tm);
  149. }
  150. return false;
  151. }
  152. inline bool date_time(std::string_view dt) {
  153. auto [size, good] = detail::date(dt);
  154. if (not good || std::strchr("Tt", dt[size]) == nullptr) {
  155. return false;
  156. }
  157. dt.remove_prefix(size + 1);
  158. return time(dt);
  159. }
  160. template <typename CharT> inline bool uri(std::basic_string_view<CharT> uri) {
  161. using delim = detail::char_delimiters<CharT>;
  162. auto test_uri_part = [&uri](char delim) {
  163. size_t const pos = uri.find(delim);
  164. if (pos == uri.npos) {
  165. return true;
  166. }
  167. auto part = uri.substr(pos + 1);
  168. uri = uri.substr(0, pos);
  169. for (size_t pos = 0; pos < part.size(); ++pos) {
  170. RETURN_UNLESS(detail::is_pchar(part, pos, ":@/?"), false);
  171. }
  172. return true;
  173. };
  174. // https://www.rfc-editor.org/rfc/rfc3986.html#appendix-A
  175. if (size_t const pos = uri.find(':'); pos != uri.npos) {
  176. RETURN_UNLESS(std::isalpha(uri[0]), false);
  177. for (size_t i = 1; i < pos; ++i) {
  178. RETURN_UNLESS(std::isalnum(uri[i]) || std::strchr("+-.", uri[i]), false);
  179. }
  180. uri.remove_prefix(pos + 1);
  181. } else {
  182. return false;
  183. }
  184. RETURN_UNLESS(test_uri_part('#'), false);
  185. RETURN_UNLESS(test_uri_part('?'), false);
  186. auto path = uri;
  187. if (uri.starts_with(delim::double_slash)) {
  188. uri.remove_prefix(2);
  189. path = uri.substr(std::min(uri.size(), uri.find('/')));
  190. uri.remove_suffix(path.size());
  191. RETURN_UNLESS(detail::is_uri_authority(uri), false);
  192. }
  193. if (size_t const pos = path.find('/'); pos != path.npos) {
  194. for (size_t i = 0; i < pos; ++i) {
  195. RETURN_UNLESS(detail::is_pchar(path, i, "@"), false);
  196. }
  197. path.remove_prefix(pos);
  198. }
  199. for (size_t i = 0; i < path.size(); ++i) {
  200. RETURN_UNLESS(detail::is_pchar(path, i, "/:@"), false);
  201. }
  202. return true;
  203. }
  204. inline bool uuid(std::string_view id) {
  205. constexpr char const * g_hex_digits = "0123456789ABCDEFabcdef";
  206. constexpr size_t g_uuid_len = 36;
  207. constexpr size_t g_uuid_tokens = 5;
  208. char tok0[9], tok1[5], tok2[5], tok3[5], tok4[13];
  209. auto is_hex = [](std::string_view s) {
  210. return s.find_first_not_of(g_hex_digits) == std::string::npos;
  211. };
  212. return id.size() == g_uuid_len &&
  213. sscanf(id.data(), "%8s-%4s-%4s-%4s-%12s", tok0, tok1, tok2, tok3, tok4) == g_uuid_tokens &&
  214. is_hex(tok0) && is_hex(tok1) && is_hex(tok2) && is_hex(tok3) && is_hex(tok4);
  215. }
  216. inline bool duration(std::string_view dur) {
  217. auto eat = [&dur](std::string_view text) {
  218. char type;
  219. unsigned int rep;
  220. if (sscanf(dur.data(), "%u%c", &rep, &type) != 2 || text.find(type) == std::string::npos) {
  221. return std::string::npos;
  222. }
  223. dur.remove_prefix(dur.find(type) + 1);
  224. return text.find(type);
  225. };
  226. if (dur[0] != 'P' || dur.size() == 1) {
  227. return false;
  228. }
  229. dur.remove_prefix(1);
  230. if (dur[0] != 'T') {
  231. if (eat("W") != std::string::npos) {
  232. return dur.empty();
  233. }
  234. std::string_view ymd{"YMD"};
  235. while (not ymd.empty() && not dur.empty()) {
  236. if (size_t n = eat(ymd); n != std::string::npos) {
  237. ymd.remove_prefix(n + 1);
  238. } else {
  239. return false;
  240. }
  241. }
  242. if (dur.empty()) {
  243. return true;
  244. }
  245. }
  246. if (dur[0] != 'T' || dur.size() == 1) {
  247. return false;
  248. }
  249. dur.remove_prefix(1);
  250. std::string_view hms{"HMS"};
  251. while (not hms.empty() && not dur.empty()) {
  252. if (size_t n = eat(hms); n != std::string::npos) {
  253. hms.remove_prefix(n + 1);
  254. } else {
  255. return false;
  256. }
  257. }
  258. return dur.empty();
  259. }
  260. template <typename CharT> bool is_invalid_host_char(CharT c) {
  261. return c != '-' && not(std::isalnum(c) || c > 0x7F);
  262. }
  263. template <typename CharT>
  264. bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
  265. using delim = detail::char_delimiters<CharT>;
  266. return (name.empty() || detail::length_u8(name) >= 64 ||
  267. (name.size() >= 4 && name.substr(2).starts_with(delim::illegal_dashes_ulabel)) ||
  268. name[0] == '-' || name.back() == '-');
  269. }
  270. #if !JVALIDATE_HAS_IDNA
  271. inline bool hostname_part(std::string_view name) {
  272. using delim = detail::char_delimiters<char>;
  273. if (is_invalid_size_or_boundary_hostname(name)) {
  274. return false;
  275. }
  276. return std::ranges::none_of(name, [](char c) { return c != '-' && not std::isalnum(c); });
  277. }
  278. #else
  279. template <typename CharT> inline bool hostname_part(std::basic_string_view<CharT> name) {
  280. using delim = detail::char_delimiters<CharT>;
  281. if (name.starts_with(delim::punycode_prefix)) {
  282. std::u32string decoded = detail::to_u32(ada::idna::to_unicode(detail::to_u8(name)));
  283. return (decoded != detail::to_u32(name)) && hostname_part<char32_t>(decoded);
  284. }
  285. if (is_invalid_size_or_boundary_hostname(name)) {
  286. return false;
  287. }
  288. if constexpr (std::is_same_v<char, CharT>) {
  289. return std::ranges::none_of(name, [](char c) { return c != '-' && not std::isalnum(c); });
  290. } else {
  291. return ada::idna::is_label_valid(name);
  292. }
  293. }
  294. #endif
  295. template <typename CharT> inline bool hostname(std::basic_string_view<CharT> name) {
  296. using delim = detail::char_delimiters<CharT>;
  297. if (name.find_first_of(delim::illegal_hostname_chars) != name.npos) {
  298. return false;
  299. }
  300. if (detail::to_u8(name).size() > (name.back() == '.' ? 254 : 253)) {
  301. return false;
  302. }
  303. if (not std::ranges::all_of(delim::special_cases,
  304. [name](auto & sc) { return sc.accepts(name); })) {
  305. return false;
  306. }
  307. for (size_t n = name.find('.'); n != std::string::npos;
  308. name.remove_prefix(n + 1), n = name.find('.')) {
  309. if (not hostname_part(name.substr(0, n))) {
  310. return false;
  311. }
  312. }
  313. return name.empty() || hostname_part(name);
  314. }
  315. inline bool ipv4(std::string_view ip) {
  316. unsigned int ip0, ip1, ip2, ip3;
  317. char eof;
  318. if (ip.find_first_not_of("0123456789.") != std::string::npos) {
  319. return false;
  320. }
  321. if (ip[0] == '0' && std::isdigit(ip[1])) {
  322. return false;
  323. }
  324. if (size_t n = ip.find(".0"); n != std::string::npos && std::isdigit(ip[n + 2])) {
  325. return false;
  326. }
  327. if (sscanf(std::string(ip).c_str(), "%3u.%3u.%3u.%3u%c", &ip0, &ip1, &ip2, &ip3, &eof) != 4) {
  328. return false;
  329. }
  330. return ip0 <= 0xFF && ip1 <= 0xFF && ip2 <= 0xFF && ip3 <= 0xFF;
  331. }
  332. inline bool ipv6(std::string_view ip) {
  333. int expected_spans = 8;
  334. if (size_t n = ip.find('.'); n != std::string::npos) {
  335. if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
  336. return false;
  337. }
  338. // This is a cheat to allow e.g. ::127.0.0.1 to validate
  339. expected_spans = 7;
  340. ip = ip.substr(0, n);
  341. }
  342. if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
  343. return false;
  344. }
  345. if (ip.size() >= 40) {
  346. return false;
  347. }
  348. bool has_compressed = false;
  349. int groups = 0;
  350. if (ip.starts_with("::")) {
  351. has_compressed = true;
  352. ip.remove_prefix(2);
  353. }
  354. while (!ip.empty()) {
  355. int data;
  356. if (sscanf(ip.data(), "%4x", &data) != 1) {
  357. return false;
  358. }
  359. if (size_t n = ip.find(':'); std::min(n, ip.size()) > 4) {
  360. return false;
  361. } else if (n != std::string::npos) {
  362. ip.remove_prefix(n + 1);
  363. } else {
  364. ip = "";
  365. }
  366. ++groups;
  367. if (ip[0] == ':') {
  368. if (std::exchange(has_compressed, true)) {
  369. return false;
  370. }
  371. ip.remove_prefix(1);
  372. }
  373. }
  374. return groups == expected_spans || (has_compressed && groups < expected_spans);
  375. }
  376. // Let's be honest - no matter what RFC 5321 §4.1.2 or RFC 6531 say, the only
  377. // way to know if an email address is valid is to try and send a message to it.
  378. // Therefore, there's no point in trying to validate things according to a
  379. // complex grammar - as long as it has an '@' sign with at least one character
  380. // on each side, we ought to call it an email.
  381. template <typename CharT> inline bool email(std::basic_string_view<CharT> em) {
  382. using delim = detail::char_delimiters<CharT>;
  383. size_t n = em.find_last_of('@');
  384. if (n == 0 || n >= em.size() - 1) {
  385. return false;
  386. }
  387. auto const who = em.substr(0, n);
  388. if (who.starts_with('"') && who.ends_with('"')) {
  389. // No validation
  390. } else if (who.starts_with('.') || who.ends_with('.')) {
  391. return false;
  392. } else if (em.substr(0, n).find(delim::dotdot) != em.npos) {
  393. return false;
  394. }
  395. auto domain = em.substr(n + 1);
  396. if (not(domain.starts_with('[') && domain.ends_with(']'))) {
  397. return hostname(domain);
  398. }
  399. domain.remove_prefix(1);
  400. domain.remove_suffix(1);
  401. if (auto ip = detail::to_u8(domain); ip.starts_with("IPv6:")) {
  402. return ipv6(ip.substr(5));
  403. } else {
  404. return ipv4(ip);
  405. }
  406. }
  407. template <typename T> inline bool ctor_as_valid(std::string_view str) {
  408. try {
  409. [[maybe_unused]] auto _ = T(str);
  410. return true;
  411. } catch (std::exception const &) { return false; }
  412. }
  413. template <auto Predicate> bool utf32(std::string_view str) {
  414. return Predicate(detail::to_u32(str));
  415. }
  416. }
  417. namespace jvalidate {
  418. class FormatValidator {
  419. public:
  420. using Predicate = bool (*)(std::string_view);
  421. enum class Status { Unknown, Unimplemented, Valid, Invalid };
  422. private:
  423. std::unordered_map<std::string, Predicate> supported_formats_{
  424. {"date", &format::date},
  425. {"date-time", &format::date_time},
  426. {"duration", &format::duration},
  427. {"email", &format::email},
  428. {"hostname", &format::hostname},
  429. {"idn-email", UTF32(email)},
  430. {"idn-hostname", UTF32(hostname)},
  431. {"ipv4", &format::ipv4},
  432. {"ipv6", &format::ipv6},
  433. {"iri", UTF32(uri)},
  434. {"iri-reference", nullptr},
  435. {"json-pointer", CONSTRUCTS(Pointer)},
  436. {"relative-json-pointer", CONSTRUCTS(RelativePointer)},
  437. {"regex", nullptr},
  438. {"time", &format::time},
  439. {"uri", &format::uri},
  440. {"uri-reference", nullptr},
  441. {"uri-template", nullptr},
  442. {"uuid", &format::uuid},
  443. };
  444. public:
  445. FormatValidator() = default;
  446. FormatValidator(Predicate is_regex) { supported_formats_.insert_or_assign("regex", is_regex); }
  447. Status operator()(std::string const & format, std::string_view text) const {
  448. if (auto it = supported_formats_.find(format); it != supported_formats_.end() && it->second) {
  449. if (not it->second) {
  450. return Status::Unimplemented;
  451. }
  452. return it->second(text) ? Status::Valid : Status::Invalid;
  453. }
  454. return Status::Unknown;
  455. }
  456. };
  457. }
// The helper macros are only needed to build the format table above; remove
// them so they do not leak into code that includes this header.
#undef CONSTRUCTS
#undef UTF32