format.h 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. #pragma once
  2. #include <cctype>
  3. #include <cstddef>
  4. #include <ctime>
  5. #include <iostream>
  6. #include <string>
  7. #include <unordered_map>
  8. #include <utility>
  9. #include <jvalidate/forward.h>
  namespace jvalidate::format::detail {
  // Returns true when year `y` is a leap year under the Gregorian rules
  // (divisible by 4, except centuries that are not divisible by 400).
  inline bool is_leapyear(int y) { return (y % 400) == 0 || ((y % 4) == 0 && (y % 100) != 0); }

  // Reports whether a calendar day is invalid.
  //   y - full year (e.g. 2024)
  //   m - ZERO-based month index (0 = January ... 11 = December)
  //   d - one-based day of the month
  // Returns true when the month index is outside 0..11, the day is below 1,
  // or the day exceeds the length of that month (29 for a leap-year February).
  inline bool illegal_date(int y, int m, int d) {
    static constexpr int days[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
    // Guard the table lookup: an out-of-range month must not index days[].
    if (m < 0 || m > 11 || d < 1) {
      return true;
    }
    int const extra_day = (m == 1 && is_leapyear(y)) ? 1 : 0;
    return d > days[m] + extra_day;
  }

  // Parses a strict "YYYY-MM-DD" date from the FRONT of `dt`; characters after
  // the tenth are ignored so that callers can continue parsing a time part.
  // Returns {10, true} on success and {0, false} otherwise.
  //
  // The fields are parsed by hand instead of with strptime() so that:
  //   - every field has exactly the required number of digits (no leading
  //     whitespace, no short fields),
  //   - nothing beyond dt.size() is read (a string_view need not be
  //     NUL-terminated),
  //   - every return statement yields the same std::pair<long, bool> type.
  inline auto date(std::string_view dt) {
    constexpr long g_date_len = 10; // length of "YYYY-MM-DD"
    auto const failure = std::make_pair(0L, false);

    if (dt.size() < static_cast<size_t>(g_date_len) || dt[4] != '-' || dt[7] != '-') {
      return failure;
    }

    // Reads `count` decimal digits starting at `pos`; -1 if any is not a digit.
    auto number = [&dt](size_t pos, size_t count) -> int {
      int value = 0;
      for (size_t i = pos; i < pos + count; ++i) {
        if (dt[i] < '0' || dt[i] > '9') {
          return -1;
        }
        value = value * 10 + (dt[i] - '0');
      }
      return value;
    };

    int const year = number(0, 4);
    int const month = number(5, 2);
    int const day = number(8, 2);
    if (year < 0 || month < 1 || month > 12 || day < 1) {
      return failure;
    }
    // illegal_date() expects a zero-based month index.
    if (illegal_date(year, month - 1, day)) {
      return failure;
    }
    return std::make_pair(g_date_len, true);
  }
  }
  30. namespace jvalidate::format {
  31. inline bool date(std::string_view dt) {
  32. auto [size, good] = detail::date(dt);
  33. return good && size == dt.size();
  34. }
  // Validates an RFC 3339 "full-time": HH:MM:SS, an optional fraction
  // (".<digits>"), and a mandatory offset that is either 'Z'/'z' or
  // (+|-)HH:MM.
  //
  // Every field is parsed by hand with explicit bounds checks. This never reads
  // outside `dt` (a string_view need not be NUL-terminated), requires exactly
  // two digits per field, and rejects embedded whitespace.
  // Seconds may be 60 to allow for a leap second.
  inline bool time(std::string_view dt) {
    auto is_digit = [](char c) { return c >= '0' && c <= '9'; };

    // Consumes exactly two digits from the front of dt and returns their
    // value, or returns -1 (consuming nothing) if they are not there.
    auto eat_two_digits = [&dt, &is_digit]() -> int {
      if (dt.size() < 2 || not is_digit(dt[0]) || not is_digit(dt[1])) {
        return -1;
      }
      int const value = (dt[0] - '0') * 10 + (dt[1] - '0');
      dt.remove_prefix(2);
      return value;
    };

    // Consumes `c` from the front of dt if present.
    auto eat_char = [&dt](char c) -> bool {
      if (dt.empty() || dt.front() != c) {
        return false;
      }
      dt.remove_prefix(1);
      return true;
    };

    int const hour = eat_two_digits();
    if (hour < 0 || hour > 23 || not eat_char(':')) {
      return false;
    }
    int const minute = eat_two_digits();
    if (minute < 0 || minute > 59 || not eat_char(':')) {
      return false;
    }
    int const second = eat_two_digits();
    if (second < 0 || second > 60) {
      return false;
    }

    // Optional fractional seconds: a '.' followed by at least one digit.
    if (eat_char('.')) {
      if (dt.empty() || not is_digit(dt.front())) {
        return false;
      }
      while (not dt.empty() && is_digit(dt.front())) {
        dt.remove_prefix(1);
      }
    }

    // The offset is mandatory.
    if (dt.empty()) {
      return false;
    }
    if (dt.front() == 'Z' || dt.front() == 'z') {
      return dt.size() == 1;
    }
    if (dt.front() != '+' && dt.front() != '-') {
      return false;
    }
    dt.remove_prefix(1);

    int const offset_hour = eat_two_digits();
    if (offset_hour < 0 || offset_hour > 23 || not eat_char(':')) {
      return false;
    }
    int const offset_minute = eat_two_digits();
    return offset_minute >= 0 && offset_minute <= 59 && dt.empty();
  }
  59. inline bool date_time(std::string_view dt) {
  60. auto [size, good] = detail::date(dt);
  61. if (not good || std::strchr("Tt", dt[size]) == nullptr) {
  62. return false;
  63. }
  64. dt.remove_prefix(size + 1);
  65. return time(dt);
  66. }
  // Validates the canonical textual UUID layout: 36 characters made of five
  // hexadecimal groups of 8, 4, 4, 4 and 12 digits separated by '-'.
  //
  // Each position is checked directly. Unlike scanf("%s") parsing, this never
  // skips whitespace, never accepts a short final group, and does not require
  // the view to be NUL-terminated.
  inline bool uuid(std::string_view id) {
    constexpr size_t g_uuid_len = 36;
    if (id.size() != g_uuid_len) {
      return false;
    }
    for (size_t i = 0; i < g_uuid_len; ++i) {
      // Offsets of the four group separators in 8-4-4-4-12.
      bool const is_separator = (i == 8 || i == 13 || i == 18 || i == 23);
      if (is_separator) {
        if (id[i] != '-') {
          return false;
        }
      } else if (std::isxdigit(static_cast<unsigned char>(id[i])) == 0) {
        return false;
      }
    }
    return true;
  }
  // Validates a duration using the grammar of RFC 3339 Appendix A:
  //   P<n>W
  //   P[<n>Y][<n>M][<n>D][T[<n>H][<n>M][<n>S]]
  // At least one component is required, components must appear in the order
  // shown, and a 'T' must be followed by at least one time component.
  // Counts are plain unsigned decimal digits (no sign, whitespace or fraction).
  inline bool duration(std::string_view dur) {
    constexpr size_t npos = std::string_view::npos;

    // Consumes "<digits><designator>" from the front of dur when the
    // designator is one of the characters in `allowed`. Returns the position
    // of the designator within `allowed`, or npos (consuming nothing).
    auto eat = [&dur](std::string_view allowed) -> size_t {
      size_t digits = 0;
      while (digits < dur.size() && dur[digits] >= '0' && dur[digits] <= '9') {
        ++digits;
      }
      // Need at least one digit and a designator character after the digits.
      if (digits == 0 || digits == dur.size()) {
        return std::string_view::npos;
      }
      size_t const which = allowed.find(dur[digits]);
      if (which != std::string_view::npos) {
        dur.remove_prefix(digits + 1);
      }
      return which;
    };

    // Also rejects the empty string and a lone "P".
    if (dur.size() < 2 || dur.front() != 'P') {
      return false;
    }
    dur.remove_prefix(1);

    if (dur.front() != 'T') {
      // The week form cannot be combined with any other component.
      if (eat("W") != npos) {
        return dur.empty();
      }
      // Date components, in order. Shrinking `ymd` past each accepted
      // designator enforces the ordering. Stop at 'T' so that a time part may
      // follow any date component (e.g. "P1YT1H").
      std::string_view ymd{"YMD"};
      while (not ymd.empty() && not dur.empty() && dur.front() != 'T') {
        size_t const n = eat(ymd);
        if (n == npos) {
          return false;
        }
        ymd.remove_prefix(n + 1);
      }
      if (dur.empty()) {
        return true;
      }
    }

    // Whatever remains must be 'T' followed by at least one time component.
    if (dur.front() != 'T' || dur.size() == 1) {
      return false;
    }
    dur.remove_prefix(1);

    std::string_view hms{"HMS"};
    while (not hms.empty() && not dur.empty()) {
      size_t const n = eat(hms);
      if (n == npos) {
        return false;
      }
      hms.remove_prefix(n + 1);
    }
    return dur.empty();
  }
  // Validates a hostname: dot-separated labels of 1..63 characters, each made
  // of ASCII letters, digits and '-', with no '-' at either end of a label.
  // The total length may not exceed 253 characters, or 254 when the name ends
  // with a trailing '.'. The empty string is not a hostname.
  //
  // Limitation - does not inspect graphemes, so it cannot check idn-hostname.
  inline bool hostname(std::string_view name) {
    constexpr size_t npos = std::string_view::npos;

    // Explicit ASCII test: locale independent, and safe for chars with the
    // high bit set (std::isalnum on a negative char is undefined).
    auto is_ascii_alnum = [](char c) {
      return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    };
    auto is_label = [&is_ascii_alnum](std::string_view label) {
      if (label.empty() || label.size() >= 64 || label.front() == '-' || label.back() == '-') {
        return false;
      }
      for (char const c : label) {
        if (c != '-' && not is_ascii_alnum(c)) {
          return false;
        }
      }
      return true;
    };

    // Must be checked before back() below, which requires a non-empty view.
    if (name.empty()) {
      return false;
    }
    size_t const max_len = (name.back() == '.') ? 254 : 253;
    if (name.size() > max_len) {
      return false;
    }

    // Validate and strip every label that is followed by a '.'.
    for (size_t n = name.find('.'); n != npos; name.remove_prefix(n + 1), n = name.find('.')) {
      if (not is_label(name.substr(0, n))) {
        return false;
      }
    }
    // An empty remainder means the name ended with a single trailing '.'.
    return name.empty() || is_label(name);
  }
  // Validates a dotted-quad IPv4 address: exactly four decimal octets in the
  // range 0..255, separated by single '.' characters, with no leading zeros
  // (so "01" is rejected but "0" is fine) and nothing before or after.
  //
  // Parsed by hand so that every octet is checked for leading zeros and no
  // character outside `ip` is ever read (the view need not be NUL-terminated).
  inline bool ipv4(std::string_view ip) {
    constexpr int g_octets = 4;
    constexpr size_t g_max_digits = 3;

    for (int octet = 0; octet < g_octets; ++octet) {
      // Every octet after the first is preceded by exactly one '.'.
      if (octet != 0) {
        if (ip.empty() || ip.front() != '.') {
          return false;
        }
        ip.remove_prefix(1);
      }

      size_t digits = 0;
      unsigned int value = 0;
      while (digits < ip.size() && ip[digits] >= '0' && ip[digits] <= '9') {
        if (digits == g_max_digits) {
          return false; // a fourth digit can never form a valid octet
        }
        value = value * 10 + static_cast<unsigned int>(ip[digits] - '0');
        ++digits;
      }
      if (digits == 0 || value > 0xFF) {
        return false;
      }
      if (digits > 1 && ip.front() == '0') {
        return false; // leading zero
      }
      ip.remove_prefix(digits);
    }
    // Anything left over (extra octets, trailing dot, junk) is an error.
    return ip.empty();
  }
  165. inline bool ipv6(std::string_view ip) {
  166. int expected_spans = 8;
  167. if (size_t n = ip.find('.'); n != std::string::npos) {
  168. if (not ipv4(ip.substr(ip.find_last_of(':') + 1))) {
  169. return false;
  170. }
  171. // This is a cheat to allow e.g. ::127.0.0.1 to validate
  172. expected_spans = 7;
  173. ip = ip.substr(0, n);
  174. }
  175. if (ip.find_first_not_of("0123456789ABCDEFabcdef:") != std::string::npos) {
  176. return false;
  177. }
  178. if (ip.size() >= 40) {
  179. return false;
  180. }
  181. bool has_compressed = false;
  182. int groups = 0;
  183. if (ip.starts_with("::")) {
  184. has_compressed = true;
  185. ip.remove_prefix(2);
  186. }
  187. while (!ip.empty()) {
  188. int data;
  189. if (sscanf(ip.data(), "%4x", &data) != 1) {
  190. return false;
  191. }
  192. if (size_t n = ip.find(':'); std::min(n, ip.size()) > 4) {
  193. return false;
  194. } else if (n != std::string::npos) {
  195. ip.remove_prefix(n + 1);
  196. } else {
  197. ip = "";
  198. }
  199. ++groups;
  200. if (ip[0] == ':') {
  201. if (std::exchange(has_compressed, true)) {
  202. return false;
  203. }
  204. ip.remove_prefix(1);
  205. }
  206. }
  207. return groups == expected_spans || (has_compressed && groups < expected_spans);
  208. }
  209. // Let's be honest - no matter what RFC 5321 §4.1.2 or RFC 6531 say, the only
  210. // way to know if an email address is valid is to try and send a message to it.
  211. // Therefore, there's no point in trying to validate things according to a
  212. // complex grammar - as long as it has an '@' sign with at least one character
  213. // on each side, we ought to call it an email.
  214. inline bool email(std::string_view em) {
  215. size_t n = em.find_last_of('@');
  216. if (n == 0 || n >= em.size() - 1) {
  217. return false;
  218. }
  219. if (em[0] == '"' && em[n - 1] == '"') {
  220. // No validation
  221. } else if (em.substr(0, n).find("..") != std::string::npos || em[n - 1] == '.' || em[0] == '.') {
  222. return false;
  223. }
  224. em.remove_prefix(n + 1);
  225. if (em.front() == '[' && em.back() == ']') {
  226. em.remove_prefix(1);
  227. em.remove_suffix(1);
  228. if (em.starts_with("IPv6:")) {
  229. return ipv6(std::string(em.substr(5)));
  230. }
  231. return ipv4(std::string(em)); // Re-acquire NULL-term
  232. }
  233. return hostname(em);
  234. }
  235. }
  236. namespace jvalidate {
  237. class FormatValidator {
  238. public:
  239. using Predicate = bool (*)(std::string_view);
  240. enum class Status { Unknown, Unimplemented, Valid, Invalid };
  241. private:
  242. std::unordered_map<std::string, Predicate> supported_formats_{
  243. {"date", &format::date},
  244. {"date-time", &format::date_time},
  245. {"duration", &format::duration},
  246. {"email", &format::email},
  247. {"hostname", &format::hostname},
  248. {"idn-email", nullptr},
  249. {"idn-hostname", nullptr},
  250. {"ipv4", &format::ipv4},
  251. {"ipv6", &format::ipv6},
  252. {"iri", nullptr},
  253. {"iri-reference", nullptr},
  254. {"json-pointer", nullptr},
  255. {"relative-json-pointer", nullptr},
  256. /* {"regex", &detail::StdRegexEngine::is_valid}, */
  257. {"time", &format::time},
  258. {"uri", nullptr},
  259. {"uri-reference", nullptr},
  260. {"uri-template", nullptr},
  261. {"uuid", &format::uuid},
  262. };
  263. public:
  264. FormatValidator() = default;
  265. Status operator()(std::string const & format, std::string_view text) const {
  266. if (auto it = supported_formats_.find(format); it != supported_formats_.end() && it->second) {
  267. if (not it->second) {
  268. return Status::Unimplemented;
  269. }
  270. return it->second(text) ? Status::Valid : Status::Invalid;
  271. }
  272. return Status::Unknown;
  273. }
  274. };
  275. }