// format.h
  1. #pragma once
  #include <algorithm>
  #include <cctype>
  #include <cstddef>
  #include <cstdio>
  #include <cstring>
  #include <ctime>
  #include <exception>
  #include <string>
  #include <string_view>
  #include <unordered_map>
  #include <utility>

  #include <jvalidate/detail/pointer.h>
  #include <jvalidate/detail/relative_pointer.h>
  #include <jvalidate/detail/string.h>
  #include <jvalidate/forward.h>
  // Helper macros for building the FormatValidator lookup table. Both are
  // #undef'd at the end of this header.
  //
  // CONSTRUCTS(TYPE): a predicate that accepts a string iff jvalidate::detail::TYPE
  // can be constructed from it without throwing a std::exception.
  // The names are fully qualified so the expansion does not depend on the
  // namespace the macro happens to be used in.
  #define CONSTRUCTS(TYPE) ::jvalidate::format::ctor_as_valid< ::jvalidate::detail::TYPE>
  // UTF32(FN): a predicate that converts its input with detail::to_u32 and then
  // applies the char32_t instantiation of jvalidate::format::FN.
  #define UTF32(FN) ::jvalidate::format::utf32< ::jvalidate::format::FN<char32_t>>
  16. namespace jvalidate::format::detail {
  17. using namespace jvalidate::detail;
  // Outcome of parsing a value from the front of a string.
  struct result {
    std::ptrdiff_t consumed; // number of characters consumed; 0 when parsing failed
    bool valid;              // true when the consumed prefix is well-formed
  };

  // Gregorian leap-year rule: divisible by 4, except centuries not divisible by 400.
  inline bool is_leapyear(int y) { return (y % 400) == 0 || ((y % 4) == 0 && (y % 100) != 0); }

  // Returns true when day-of-month `d` does not exist in month `m` of year `y`.
  //
  // `m` is ZERO-based (0 = January ... 11 = December), matching tm::tm_mon;
  // `d` is one-based. Out-of-range months and non-positive days are reported
  // as illegal instead of indexing outside the lookup table.
  inline bool illegal_date(int y, int m, int d) {
    static constexpr int days[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
    if (m < 0 || m > 11 || d < 1) {
      return true;
    }
    int const last_day = days[m] + ((m == 1 && is_leapyear(y)) ? 1 : 0);
    return d > last_day;
  }

  // Parses a full-date of the exact shape "YYYY-MM-DD" from the FRONT of `dt`.
  //
  // Trailing characters are permitted (callers decide whether they are
  // acceptable), so on success `consumed` is always 10. On failure the result
  // is {0, false}.
  //
  // The parse is done by hand rather than with strptime(): `dt` is a
  // string_view and is not guaranteed to be NUL-terminated, and strptime() may
  // also skip whitespace inside the fields.
  inline result date(std::string_view dt) {
    constexpr std::size_t date_len = 10; // strlen("YYYY-MM-DD")
    constexpr result failure{0, false};

    if (dt.size() < date_len || dt[4] != '-' || dt[7] != '-') {
      return failure;
    }

    // Reads `count` decimal digits starting at `pos`; -1 if any is not a digit.
    auto const number = [dt](std::size_t pos, std::size_t count) {
      int value = 0;
      for (std::size_t i = pos; i < pos + count; ++i) {
        if (dt[i] < '0' || dt[i] > '9') {
          return -1;
        }
        value = value * 10 + (dt[i] - '0');
      }
      return value;
    };

    int const year = number(0, 4);
    int const month = number(5, 2);
    int const day = number(8, 2);
    if (year < 0 || month < 1 || month > 12 || day < 1 || illegal_date(year, month - 1, day)) {
      return failure;
    }
    return {static_cast<std::ptrdiff_t>(date_len), true};
  }
  40. }
  41. namespace jvalidate::format {
  42. inline bool date(std::string_view dt) {
  43. auto [consumed, valid] = detail::date(dt);
  44. return valid && consumed == dt.size();
  45. }
  // "time" format: validates an RFC 3339 full-time,
  //
  //   HH:MM:SS[.fraction](Z|z|(+|-)HH:MM)
  //
  // The time-offset is mandatory. Every numeric field must have exactly two
  // digits (the fraction: one or more). A seconds value of 60 is accepted only
  // for a leap second, i.e. when the time converted to UTC is 23:59:60.
  //
  // Parsing is done by hand: `dt` is a string_view and need not be
  // NUL-terminated, so C string routines must not be run over dt.data().
  inline bool time(std::string_view dt) {
    auto const is_digit = [](char c) { return c >= '0' && c <= '9'; };
    // Two-digit number at `pos` of the CURRENT view, or -1 if absent/malformed.
    auto const two_digits = [&dt, &is_digit](std::size_t pos) -> int {
      if (pos + 2 > dt.size() || !is_digit(dt[pos]) || !is_digit(dt[pos + 1])) {
        return -1;
      }
      return (dt[pos] - '0') * 10 + (dt[pos + 1] - '0');
    };

    constexpr std::size_t clock_len = 8; // strlen("HH:MM:SS")
    // "<=": at least one character of time-offset must follow the clock.
    if (dt.size() <= clock_len || dt[2] != ':' || dt[5] != ':') {
      return false;
    }
    int const hour = two_digits(0);
    int const minute = two_digits(3);
    int const second = two_digits(6);
    if (hour < 0 || hour > 23 || minute < 0 || minute > 59 || second < 0 || second > 60) {
      return false;
    }
    dt.remove_prefix(clock_len);

    // Optional fractional seconds: '.' followed by at least one digit.
    if (dt.front() == '.') {
      std::size_t digits = 0;
      while (1 + digits < dt.size() && is_digit(dt[1 + digits])) {
        ++digits;
      }
      if (digits == 0) {
        return false;
      }
      dt.remove_prefix(1 + digits);
    }
    if (dt.empty()) {
      return false; // the offset is missing
    }

    int offset_minutes = 0; // local time minus UTC
    if (dt.front() == 'Z' || dt.front() == 'z') {
      if (dt.size() != 1) {
        return false;
      }
    } else if (dt.front() == '+' || dt.front() == '-') {
      if (dt.size() != 6 || dt[3] != ':') { // sign + "HH:MM"
        return false;
      }
      int const off_hour = two_digits(1);
      int const off_minute = two_digits(4);
      if (off_hour < 0 || off_hour > 23 || off_minute < 0 || off_minute > 59) {
        return false;
      }
      offset_minutes = (dt.front() == '-' ? -1 : 1) * (off_hour * 60 + off_minute);
    } else {
      return false;
    }

    if (second == 60) {
      constexpr int minutes_per_day = 24 * 60;
      int const utc =
          (((hour * 60 + minute - offset_minutes) % minutes_per_day) + minutes_per_day) %
          minutes_per_day;
      return utc == minutes_per_day - 1; // 23:59 UTC
    }
    return true;
  }
  70. inline bool date_time(std::string_view dt) {
  71. auto [size, good] = detail::date(dt);
  72. if (not good || std::strchr("Tt", dt[size]) == nullptr) {
  73. return false;
  74. }
  75. dt.remove_prefix(size + 1);
  76. return time(dt);
  77. }
  // "uuid" format: exactly 36 characters laid out as 8-4-4-4-12 hexadecimal
  // digits (either case) separated by '-'.
  //
  // Each position is checked directly. A scanf-style "%8s-%4s-..." split stops
  // at whitespace, which lets a short final group followed by a space through,
  // and it also requires a NUL-terminated buffer that a string_view does not
  // promise.
  inline bool uuid(std::string_view id) {
    constexpr std::size_t uuid_len = 36;
    if (id.size() != uuid_len) {
      return false;
    }
    auto const is_hex = [](char c) {
      return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    };
    for (std::size_t i = 0; i < uuid_len; ++i) {
      // Offsets of the four group separators.
      bool const separator = (i == 8 || i == 13 || i == 18 || i == 23);
      if (separator ? id[i] != '-' : !is_hex(id[i])) {
        return false;
      }
    }
    return true;
  }
  // "duration" format: validates an ISO 8601 duration of one of the shapes
  //
  //   PnW
  //   P[nY][nM][nD][T[nH][nM][nS]]
  //
  // where each n is one or more decimal digits, designators must appear in the
  // order shown, at least one component is present, and a 'T' must be followed
  // by at least one time component. Fractional values are not supported.
  //
  // Components are scanned by hand: no sign or whitespace is tolerated before
  // a number, and nothing is read beyond the end of the view.
  inline bool duration(std::string_view dur) {
    constexpr auto npos = std::string_view::npos;

    // Consumes "<digits><designator>" from the front of `dur` when the
    // designator is one of `allowed`. Returns the designator's index within
    // `allowed`, or npos (leaving `dur` untouched) when nothing matched.
    auto const eat = [&dur](std::string_view allowed) -> std::size_t {
      std::size_t digits = 0;
      while (digits < dur.size() && dur[digits] >= '0' && dur[digits] <= '9') {
        ++digits;
      }
      if (digits == 0 || digits == dur.size()) {
        return std::string_view::npos;
      }
      std::size_t const which = allowed.find(dur[digits]);
      if (which != std::string_view::npos) {
        dur.remove_prefix(digits + 1);
      }
      return which;
    };

    if (dur.size() < 2 || dur.front() != 'P') {
      return false;
    }
    dur.remove_prefix(1);

    if (dur.front() != 'T') {
      // Weeks cannot be combined with any other component.
      if (eat("W") != npos) {
        return dur.empty();
      }

      // Date components, in order. Stop at 'T' so that e.g. "P1YT1H" carries
      // on to the time section instead of failing on the missing M/D.
      std::string_view ymd{"YMD"};
      while (not ymd.empty() && not dur.empty() && dur.front() != 'T') {
        std::size_t const n = eat(ymd);
        if (n == npos) {
          return false;
        }
        ymd.remove_prefix(n + 1);
      }

      if (dur.empty()) {
        return true;
      }
    }

    // Whatever remains must be 'T' followed by at least one time component.
    if (dur.front() != 'T' || dur.size() == 1) {
      return false;
    }
    dur.remove_prefix(1);

    std::string_view hms{"HMS"};
    while (not hms.empty() && not dur.empty()) {
      std::size_t const n = eat(hms);
      if (n == npos) {
        return false;
      }
      hms.remove_prefix(n + 1);
    }

    return dur.empty();
  }
  // "hostname" format: validates a dot-separated host name.
  //
  //  - the name is non-empty and at most 253 characters long (254 when it ends
  //    with the root dot, which is permitted);
  //  - every label is 1..63 characters long, consists only of ASCII letters,
  //    ASCII digits and '-', and neither starts nor ends with '-'.
  //
  // Limitation: this works on individual code units / code points and does not
  // inspect graphemes, so it cannot properly check idn-hostname. When
  // instantiated with a wide CharT, any non-ASCII code point is simply
  // rejected; real internationalised-name support would need the IDNA rules.
  //
  // The character test is a plain ASCII range check rather than std::isalnum,
  // whose behaviour is undefined for arguments that are not representable as
  // unsigned char (negative char values, large char32_t values).
  template <typename CharT = char> inline bool hostname(std::basic_string_view<CharT> name) {
    using view = std::basic_string_view<CharT>;
    constexpr std::size_t max_label_len = 63;
    constexpr std::size_t max_name_len = 253;

    auto const is_label = [](view label) {
      if (label.empty() || label.size() > max_label_len || label.front() == '-' ||
          label.back() == '-') {
        return false;
      }
      for (CharT const c : label) {
        bool const alnum =
            (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
        if (not alnum && c != '-') {
          return false;
        }
      }
      return true;
    };

    if (name.empty()) {
      return false;
    }
    // A trailing root dot does not count towards the length limit.
    if (name.size() > max_name_len + (name.back() == '.' ? 1 : 0)) {
      return false;
    }

    for (std::size_t n = name.find('.'); n != view::npos;
         name.remove_prefix(n + 1), n = name.find('.')) {
      if (not is_label(name.substr(0, n))) {
        return false;
      }
    }

    // Empty here means the name ended with the root dot.
    return name.empty() || is_label(name);
  }
  // "ipv4" format: validates a dotted-quad IPv4 address.
  //
  // Exactly four decimal octets separated by '.', each 1-3 digits, in the range
  // 0..255 and without a leading zero (so "0" is fine, "01" is not). Nothing
  // may precede or follow the address.
  //
  // Every octet is checked individually, so a leading zero is caught in any
  // position, and the view is never read beyond its end.
  inline bool ipv4(std::string_view ip) {
    constexpr int octet_count = 4;
    constexpr std::size_t max_digits = 3;
    constexpr unsigned max_octet = 255;

    for (int i = 0; i < octet_count; ++i) {
      std::size_t len = 0;
      unsigned value = 0;
      while (len < ip.size() && ip[len] >= '0' && ip[len] <= '9') {
        if (len == max_digits) {
          return false; // a fourth digit
        }
        value = value * 10 + static_cast<unsigned>(ip[len] - '0');
        ++len;
      }
      if (len == 0 || value > max_octet || (len > 1 && ip.front() == '0')) {
        return false;
      }
      ip.remove_prefix(len);

      // Every octet except the last is followed by a '.'.
      if (i + 1 < octet_count) {
        if (ip.empty() || ip.front() != '.') {
          return false;
        }
        ip.remove_prefix(1);
      }
    }
    return ip.empty();
  }
  176. inline bool ipv6(std::string_view ip) {
  177. int expected_spans = 8;
  178. if (size_t n = ip.find('.'); n != std::string::npos) {
  179. if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
  180. return false;
  181. }
  182. // This is a cheat to allow e.g. ::127.0.0.1 to validate
  183. expected_spans = 7;
  184. ip = ip.substr(0, n);
  185. }
  186. if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
  187. return false;
  188. }
  189. if (ip.size() >= 40) {
  190. return false;
  191. }
  192. bool has_compressed = false;
  193. int groups = 0;
  194. if (ip.starts_with("::")) {
  195. has_compressed = true;
  196. ip.remove_prefix(2);
  197. }
  198. while (!ip.empty()) {
  199. int data;
  200. if (sscanf(ip.data(), "%4x", &data) != 1) {
  201. return false;
  202. }
  203. if (size_t n = ip.find(':'); std::min(n, ip.size()) > 4) {
  204. return false;
  205. } else if (n != std::string::npos) {
  206. ip.remove_prefix(n + 1);
  207. } else {
  208. ip = "";
  209. }
  210. ++groups;
  211. if (ip[0] == ':') {
  212. if (std::exchange(has_compressed, true)) {
  213. return false;
  214. }
  215. ip.remove_prefix(1);
  216. }
  217. }
  218. return groups == expected_spans || (has_compressed && groups < expected_spans);
  219. }
  220. // Let's be honest - no matter what RFC 5321 §4.1.2 or RFC 6531 say, the only
  221. // way to know if an email address is valid is to try and send a message to it.
  222. // Therefore, there's no point in trying to validate things according to a
  223. // complex grammar - as long as it has an '@' sign with at least one character
  224. // on each side, we ought to call it an email.
  225. template <typename CharT = char> inline bool email(std::basic_string_view<CharT> em) {
  226. size_t n = em.find_last_of('@');
  227. if (n == 0 || n >= em.size() - 1) {
  228. return false;
  229. }
  230. auto const who = em.substr(0, n);
  231. if (who.starts_with('"') && who.ends_with('"')) {
  232. // No validation
  233. } else if (who.starts_with('.') || who.ends_with('.')) {
  234. return false;
  235. } else if (CharT const dots[3] = {'.', '.', '\0'}; em.substr(0, n).find(dots) != em.npos) {
  236. return false;
  237. }
  238. auto domain = em.substr(n + 1);
  239. if (not(domain.starts_with('[') && domain.ends_with(']'))) {
  240. return hostname(domain);
  241. }
  242. domain.remove_prefix(1);
  243. domain.remove_suffix(1);
  244. std::string ip(domain.size(), '\0'); // Re-acquiring the NULL terminator
  245. std::ranges::copy(domain, ip.begin());
  246. if (ip.starts_with("IPv6:")) {
  247. return ipv6(ip.substr(5));
  248. }
  249. return ipv4(ip);
  250. }
  // Adapts a throwing constructor into a predicate: returns true when a T can
  // be constructed from `str`, false when doing so throws something derived
  // from std::exception. Any other exception type propagates to the caller.
  template <typename T> inline bool ctor_as_valid(std::string_view str) {
    bool constructed = false;
    try {
      // Only the success or failure of construction matters; the object itself
      // is discarded immediately.
      static_cast<void>(T(str));
      constructed = true;
    } catch (std::exception const &) {
      constructed = false;
    }
    return constructed;
  }
  257. template <auto Predicate> bool utf32(std::string_view str) {
  258. return Predicate(detail::to_u32(str));
  259. }
  260. }
  261. namespace jvalidate {
  262. class FormatValidator {
  263. public:
  264. using Predicate = bool (*)(std::string_view);
  265. enum class Status { Unknown, Unimplemented, Valid, Invalid };
  266. private:
  267. std::unordered_map<std::string, Predicate> supported_formats_{
  268. {"date", &format::date},
  269. {"date-time", &format::date_time},
  270. {"duration", &format::duration},
  271. {"email", &format::email},
  272. {"hostname", &format::hostname},
  273. {"idn-email", UTF32(email)},
  274. {"idn-hostname", UTF32(hostname)},
  275. {"ipv4", &format::ipv4},
  276. {"ipv6", &format::ipv6},
  277. {"iri", nullptr},
  278. {"iri-reference", nullptr},
  279. {"json-pointer", CONSTRUCTS(Pointer)},
  280. {"relative-json-pointer", CONSTRUCTS(RelativePointer)},
  281. {"regex", nullptr},
  282. {"time", &format::time},
  283. {"uri", nullptr},
  284. {"uri-reference", nullptr},
  285. {"uri-template", nullptr},
  286. {"uuid", &format::uuid},
  287. };
  288. public:
  289. FormatValidator() = default;
  290. FormatValidator(Predicate is_regex) { supported_formats_.insert_or_assign("regex", is_regex); }
  291. Status operator()(std::string const & format, std::string_view text) const {
  292. if (auto it = supported_formats_.find(format); it != supported_formats_.end() && it->second) {
  293. if (not it->second) {
  294. return Status::Unimplemented;
  295. }
  296. return it->second(text) ? Status::Valid : Status::Invalid;
  297. }
  298. return Status::Unknown;
  299. }
  300. };
  301. }
  302. #undef CONSTRUCTS
  303. #undef UTF32