format.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407
  1. #pragma once
  2. #include <jvalidate/_config.h>
  3. #include <cctype>
  4. #include <chrono>
  5. #include <cstddef>
  6. #include <cstring>
  7. #include <ctime>
  8. #include <string>
  9. #include <string_view>
  10. #include <unordered_map>
  11. #include <utility>
  12. #ifdef JVALIDATE_HAS_IDNA
  13. #include <ada/idna/to_unicode.h>
  14. #include <ada/idna/validity.h>
  15. #endif
  16. #include <jvalidate/detail/idna_special_cases.h>
  17. #include <jvalidate/detail/pointer.h>
  18. #include <jvalidate/detail/relative_pointer.h>
  19. #include <jvalidate/detail/string.h>
  20. #include <jvalidate/forward.h>
  21. #define CONSTRUCTS(TYPE) format::ctor_as_valid<detail::TYPE>
  22. #ifdef JVALIDATE_HAS_IDNA
  23. #define UTF32(FN) format::utf32<format::FN<char32_t>>
  24. #else
  25. #define UTF32(FN) nullptr
  26. #endif
  27. namespace jvalidate::format::detail {
  // Outcome of parsing a prefix of some input text.
  struct result {
    // Number of characters matched at the front of the input (0 on failure).
    std::ptrdiff_t consumed;
    // True when the matched prefix is a well-formed value.
    bool valid;
  };
  // Gregorian leap-year rule: every fourth year, except century years that
  // are not divisible by 400.
  inline bool is_leapyear(int y) {
    if (y % 4 != 0) {
      return false;
    }
    if (y % 100 != 0) {
      return true;
    }
    return y % 400 == 0;
  }
  33. inline bool illegal_date(int y, int m, int d) {
  34. static constexpr int days[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
  35. if (is_leapyear(y) && m == 1) {
  36. --d;
  37. }
  38. return d > days[m];
  39. }
  40. inline result date(std::string_view dt) {
  41. struct tm tm;
  42. if (auto end = strptime(dt.data(), "%Y-%m-%d", &tm); end) {
  43. if ((end - dt.data()) != 10 || illegal_date(tm.tm_year + 1900, tm.tm_mon, tm.tm_mday)) {
  44. return {.consumed = 0, .valid = false};
  45. }
  46. return {.consumed = end - dt.data(), .valid = true};
  47. }
  48. return {.consumed = 0L, .valid = false};
  49. }
  // Decides whether the seconds field of `tm` is acceptable.
  //
  // Any value of tm_sec other than 60 is accepted outright. For tm_sec == 60
  // the answer depends on the standard library:
  //  - with C++20 chrono time-zone support, `tm` is converted with
  //    std::mktime() and looked up in the tz database's leap-second list;
  //  - otherwise the value is rejected.
  //
  // NOTE(review): despite the name, this returns true for every time that is
  // NOT a leap second - confirm that callers rely on that reading.
  // NOTE(review): std::mktime() interprets `tm` as local time and needs the
  // date fields to be populated - verify against the callers.
  inline bool is_leapsecond(std::tm tm) {
    constexpr int leap_second_value = 60;
    if (tm.tm_sec == leap_second_value) {
  #if __cpp_lib_chrono >= 201907L
      tm.tm_isdst = -1;
      std::chrono::seconds const when(std::mktime(&tm));
      auto const & known = std::chrono::get_tzdb().leap_seconds;
      return std::ranges::find(known, when) != known.end();
  #else
      return false;
  #endif
    }
    return true;
  }
  63. }
  64. namespace jvalidate::format {
  65. inline bool date(std::string_view dt) {
  66. auto [consumed, valid] = detail::date(dt);
  67. return valid && consumed == dt.size();
  68. }
  69. inline bool time(std::string_view dt) {
  70. std::tm tm;
  71. char const * end = strptime(dt.data(), "%T", &tm);
  72. if (end == nullptr || end == dt.end() || (end - dt.data()) < 8) {
  73. return false;
  74. }
  75. dt.remove_prefix(end - dt.begin());
  76. if (dt[0] == '.') {
  77. dt.remove_prefix(1);
  78. if (dt.empty() || not std::isdigit(dt[0])) {
  79. return false;
  80. }
  81. while (std::isdigit(dt[0])) {
  82. dt.remove_prefix(1);
  83. }
  84. }
  85. if (dt[0] == 'Z' || dt[0] == 'z') {
  86. return dt.size() == 1 && detail::is_leapsecond(tm);
  87. }
  88. if (std::strchr("+-", dt[0])) {
  89. return strptime(dt.data() + 1, "%R", &tm) == dt.end() && detail::is_leapsecond(tm);
  90. }
  91. return false;
  92. }
  93. inline bool date_time(std::string_view dt) {
  94. auto [size, good] = detail::date(dt);
  95. if (not good || std::strchr("Tt", dt[size]) == nullptr) {
  96. return false;
  97. }
  98. dt.remove_prefix(size + 1);
  99. return time(dt);
  100. }
  // "uuid" format: exactly 36 characters laid out as 8-4-4-4-12 hexadecimal
  // digits (either case) separated by '-'.
  //
  // Every position is checked directly. The earlier sscanf("%8s-%4s-...")
  // approach skipped whitespace before each token, so a 36-character string
  // with a leading or embedded blank could validate, and it scanned a buffer
  // that is not guaranteed to be NUL-terminated.
  inline bool uuid(std::string_view id) {
    constexpr std::size_t g_uuid_len = 36;
    auto const is_hex = [](char c) {
      return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    };

    if (id.size() != g_uuid_len) {
      return false;
    }
    for (std::size_t i = 0; i < g_uuid_len; ++i) {
      // Separators sit after the 8-, 4-, 4- and 4-digit groups.
      bool const dash_expected = (i == 8 || i == 13 || i == 18 || i == 23);
      if (dash_expected ? id[i] != '-' : not is_hex(id[i])) {
        return false;
      }
    }
    return true;
  }
  // "duration" format: 'P' followed by either
  //   - a single week element  "<digits>W", or
  //   - date elements "<digits>Y", "<digits>M", "<digits>D" (each optional,
  //     in that order), optionally followed by 'T' and time elements
  //     "<digits>H", "<digits>M", "<digits>S" (each optional, in that order,
  //     at least one required after 'T').
  // At least one element must be present.
  //
  // Fixes over the sscanf-based version:
  //   - a date part that stops before 'D' may now be followed by a time part
  //     ("P1YT1H" used to be rejected);
  //   - counts are plain digit runs, so blanks and signs ("P 1Y", "P+1Y")
  //     are no longer accepted;
  //   - empty input is rejected without indexing into it.
  inline bool duration(std::string_view dur) {
    constexpr auto npos = std::string_view::npos;

    // Consumes one "<digits><unit>" element from the front of `dur` when
    // <unit> is one of `units`. Returns the unit's index within `units`, or
    // npos (leaving `dur` untouched) when no such element is present.
    auto eat = [&dur](std::string_view units) -> std::size_t {
      std::size_t digits = 0;
      while (digits < dur.size() && dur[digits] >= '0' && dur[digits] <= '9') {
        ++digits;
      }
      if (digits == 0 || digits == dur.size()) {
        return std::string_view::npos;
      }
      std::size_t const which = units.find(dur[digits]);
      if (which != std::string_view::npos) {
        dur.remove_prefix(digits + 1);
      }
      return which;
    };

    // Consumes elements whose units appear in `order`, each at most once and
    // in the given order. Stops early at a 'T' when `stop_at_time` is set.
    auto eat_in_order = [&dur, &eat](std::string_view order, bool stop_at_time) {
      while (not order.empty() && not dur.empty() && not(stop_at_time && dur[0] == 'T')) {
        std::size_t const n = eat(order);
        if (n == std::string_view::npos) {
          return false;
        }
        order.remove_prefix(n + 1);
      }
      return true;
    };

    if (dur.size() < 2 || dur[0] != 'P') {
      return false;
    }
    dur.remove_prefix(1);

    if (dur[0] != 'T') {
      // Weeks cannot be combined with any other element.
      if (eat("W") != npos) {
        return dur.empty();
      }
      if (not eat_in_order("YMD", true)) {
        return false;
      }
      if (dur.empty()) {
        return true;
      }
    }

    // Whatever remains must be a non-empty time part.
    if (dur[0] != 'T' || dur.size() == 1) {
      return false;
    }
    dur.remove_prefix(1);
    return eat_in_order("HMS", false) && dur.empty();
  }
  // Reports whether code unit `c` may NOT appear inside a host label.
  //
  // Allowed: '-', ASCII letters and digits, and any value above 0x7F.
  // Everything else (including negative values of a signed char type) is
  // invalid.
  //
  // std::isalnum() is only defined for arguments representable as
  // unsigned char, so it is consulted solely for values in the ASCII range;
  // wider or negative code units are classified before it is reached.
  template <typename CharT> bool is_invalid_host_char(CharT c) {
    auto const code = static_cast<long long>(c);
    if (code > 0x7F) {
      return false;
    }
    if (code < 0) {
      return true;
    }
    return c != '-' && not std::isalnum(static_cast<int>(code));
  }
  160. template <typename CharT>
  161. bool is_invalid_size_or_boundary_hostname(std::basic_string_view<CharT> name) {
  162. using delim = detail::char_delimiters<CharT>;
  163. return (name.empty() || detail::length_u8(name) >= 64 ||
  164. (name.size() >= 4 && name.substr(2).starts_with(delim::illegal_dashes_ulabel)) ||
  165. name[0] == '-' || name.back() == '-');
  166. }
  167. #ifndef JVALIDATE_HAS_IDNA
  168. inline bool hostname_part(std::string_view name) {
  169. using delim = detail::char_delimiters<char>;
  170. if (is_invalid_size_or_boundary_hostname(name)) {
  171. return false;
  172. }
  173. return std::ranges::none_of(name, [](char c) { return c != '-' && not std::isalnum(c); });
  174. }
  175. #else
  176. template <typename CharT> inline bool hostname_part(std::basic_string_view<CharT> name) {
  177. using delim = detail::char_delimiters<CharT>;
  178. if (name.starts_with(delim::punycode_prefix)) {
  179. std::u32string decoded = detail::to_u32(ada::idna::to_unicode(detail::to_u8(name)));
  180. return (decoded != detail::to_u32(name)) && hostname_part<char32_t>(decoded);
  181. }
  182. if (is_invalid_size_or_boundary_hostname(name)) {
  183. return false;
  184. }
  185. if constexpr (std::is_same_v<char, CharT>) {
  186. return std::ranges::none_of(name, [](char c) { return c != '-' && not std::isalnum(c); });
  187. } else {
  188. return ada::idna::is_label_valid(name);
  189. }
  190. }
  191. #endif
  192. // Limitation - does not inspect graphemes, so it cannot check idn-hostname
  193. // to fix this - we'd need to
  194. template <typename CharT = char> inline bool hostname(std::basic_string_view<CharT> name) {
  195. using delim = detail::char_delimiters<CharT>;
  196. if (name.find_first_of(delim::illegal_hostname_chars) != name.npos) {
  197. return false;
  198. }
  199. if (detail::to_u8(name).size() > (name.back() == '.' ? 254 : 253)) {
  200. return false;
  201. }
  202. if (not std::ranges::all_of(delim::special_cases,
  203. [name](auto & sc) { return sc.accepts(name); })) {
  204. return false;
  205. }
  206. for (size_t n = name.find('.'); n != std::string::npos;
  207. name.remove_prefix(n + 1), n = name.find('.')) {
  208. if (not hostname_part(name.substr(0, n))) {
  209. return false;
  210. }
  211. }
  212. return name.empty() || hostname_part(name);
  213. }
  // "ipv4" format: four decimal octets separated by '.', each 0-255, one to
  // three digits, with no leading zero on a multi-digit octet, and nothing
  // before, between or after them.
  //
  // Parsed by hand so that every octet gets the same checks. The earlier
  // version indexed ip[0], ip[1] and ip[n + 2] without checking the length
  // and inspected only the first ".0", so "1.0.01.1" was accepted.
  inline bool ipv4(std::string_view ip) {
    constexpr int octet_count = 4;
    constexpr std::size_t max_digits = 3;

    for (int i = 0; i < octet_count; ++i) {
      std::size_t len = 0;
      unsigned int value = 0;
      while (len < ip.size() && ip[len] >= '0' && ip[len] <= '9') {
        if (len == max_digits) {
          return false; // a fourth digit: too long to be an octet
        }
        value = value * 10 + static_cast<unsigned int>(ip[len] - '0');
        ++len;
      }
      bool const leading_zero = len > 1 && ip[0] == '0';
      if (len == 0 || leading_zero || value > 0xFF) {
        return false;
      }
      ip.remove_prefix(len);

      // Every octet except the last must be followed by a single '.'.
      if (i + 1 < octet_count) {
        if (ip.empty() || ip[0] != '.') {
          return false;
        }
        ip.remove_prefix(1);
      }
    }
    return ip.empty();
  }
  231. inline bool ipv6(std::string_view ip) {
  232. int expected_spans = 8;
  233. if (size_t n = ip.find('.'); n != std::string::npos) {
  234. if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
  235. return false;
  236. }
  237. // This is a cheat to allow e.g. ::127.0.0.1 to validate
  238. expected_spans = 7;
  239. ip = ip.substr(0, n);
  240. }
  241. if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
  242. return false;
  243. }
  244. if (ip.size() >= 40) {
  245. return false;
  246. }
  247. bool has_compressed = false;
  248. int groups = 0;
  249. if (ip.starts_with("::")) {
  250. has_compressed = true;
  251. ip.remove_prefix(2);
  252. }
  253. while (!ip.empty()) {
  254. int data;
  255. if (sscanf(ip.data(), "%4x", &data) != 1) {
  256. return false;
  257. }
  258. if (size_t n = ip.find(':'); std::min(n, ip.size()) > 4) {
  259. return false;
  260. } else if (n != std::string::npos) {
  261. ip.remove_prefix(n + 1);
  262. } else {
  263. ip = "";
  264. }
  265. ++groups;
  266. if (ip[0] == ':') {
  267. if (std::exchange(has_compressed, true)) {
  268. return false;
  269. }
  270. ip.remove_prefix(1);
  271. }
  272. }
  273. return groups == expected_spans || (has_compressed && groups < expected_spans);
  274. }
  275. // Let's be honest - no matter what RFC 5321 §4.1.2 or RFC 6531 say, the only
  276. // way to know if an email address is valid is to try and send a message to it.
  277. // Therefore, there's no point in trying to validate things according to a
  278. // complex grammar - as long as it has an '@' sign with at least one character
  279. // on each side, we ought to call it an email.
  280. template <typename CharT = char> inline bool email(std::basic_string_view<CharT> em) {
  281. using delim = detail::char_delimiters<CharT>;
  282. size_t n = em.find_last_of('@');
  283. if (n == 0 || n >= em.size() - 1) {
  284. return false;
  285. }
  286. auto const who = em.substr(0, n);
  287. if (who.starts_with('"') && who.ends_with('"')) {
  288. // No validation
  289. } else if (who.starts_with('.') || who.ends_with('.')) {
  290. return false;
  291. } else if (em.substr(0, n).find(delim::dotdot) != em.npos) {
  292. return false;
  293. }
  294. auto domain = em.substr(n + 1);
  295. if (not(domain.starts_with('[') && domain.ends_with(']'))) {
  296. return hostname(domain);
  297. }
  298. domain.remove_prefix(1);
  299. domain.remove_suffix(1);
  300. if (auto ip = detail::to_u8(domain); ip.starts_with("IPv6:")) {
  301. return ipv6(ip.substr(5));
  302. } else {
  303. return ipv4(ip);
  304. }
  305. }
  // Adapts a throwing constructor into a predicate: `str` is valid when a T
  // can be constructed from it, and invalid when that construction throws
  // something derived from std::exception. Other exception types propagate.
  template <typename T> inline bool ctor_as_valid(std::string_view str) {
    bool accepted = true;
    try {
      // Only the act of construction matters; the object is discarded.
      static_cast<void>(T(str));
    } catch (std::exception const &) {
      accepted = false;
    }
    return accepted;
  }
  312. template <auto Predicate> bool utf32(std::string_view str) {
  313. return Predicate(detail::to_u32(str));
  314. }
  315. }
  316. namespace jvalidate {
  317. class FormatValidator {
  318. public:
  319. using Predicate = bool (*)(std::string_view);
  320. enum class Status { Unknown, Unimplemented, Valid, Invalid };
  321. private:
  322. std::unordered_map<std::string, Predicate> supported_formats_{
  323. {"date", &format::date},
  324. {"date-time", &format::date_time},
  325. {"duration", &format::duration},
  326. {"email", &format::email},
  327. {"hostname", &format::hostname},
  328. {"idn-email", UTF32(email)},
  329. {"idn-hostname", UTF32(hostname)},
  330. {"ipv4", &format::ipv4},
  331. {"ipv6", &format::ipv6},
  332. {"iri", nullptr},
  333. {"iri-reference", nullptr},
  334. {"json-pointer", CONSTRUCTS(Pointer)},
  335. {"relative-json-pointer", CONSTRUCTS(RelativePointer)},
  336. {"regex", nullptr},
  337. {"time", &format::time},
  338. {"uri", nullptr},
  339. {"uri-reference", nullptr},
  340. {"uri-template", nullptr},
  341. {"uuid", &format::uuid},
  342. };
  343. public:
  344. FormatValidator() = default;
  345. FormatValidator(Predicate is_regex) { supported_formats_.insert_or_assign("regex", is_regex); }
  346. Status operator()(std::string const & format, std::string_view text) const {
  347. if (auto it = supported_formats_.find(format); it != supported_formats_.end() && it->second) {
  348. if (not it->second) {
  349. return Status::Unimplemented;
  350. }
  351. return it->second(text) ? Status::Valid : Status::Invalid;
  352. }
  353. return Status::Unknown;
  354. }
  355. };
  356. }
  357. #undef CONSTRUCTS
  358. #undef UTF32