idna_special_cases.h 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. #pragma once
  2. #include <string_view>
  3. #include <jvalidate/detail/string.h>
  4. #include <jvalidate/forward.h>
  5. namespace jvalidate::format::detail {
  /**
   * @brief A contextual rule: every occurrence in a string of any character
   * from @c target must satisfy the positional predicate @c accepts_at.
   *
   * Kept as a plain aggregate (two public members, no constructors) so it can
   * be brace-initialized inside constexpr tables.
   */
  template <typename CharT> struct special_case {
    /// The set of characters this rule applies to.
    std::basic_string_view<CharT> target;
    /// Predicate called with the whole string and the index of a matched character.
    bool (*accepts_at)(std::basic_string_view<CharT>, size_t);

    /**
     * @brief Check every position of @p str holding a character from @c target.
     * @param str The string to scan.
     * @return false as soon as @c accepts_at rejects a matched position;
     * true otherwise (including when nothing in @p str matches).
     */
    bool accepts(std::basic_string_view<CharT> const str) const {
      auto pos = str.find_first_of(target);
      while (pos != std::basic_string_view<CharT>::npos) {
        if (!accepts_at(str, pos)) {
          return false;
        }
        // Resume the search just past the position that was checked.
        pos = str.find_first_of(target, pos + 1);
      }
      return true;
    }
  };
  19. }
  20. namespace jvalidate::format::detail {
  /**
   * @brief Characters that receive special-case treatment (used as the target
   * set of a special_case rule).
   *
   * Declared `inline constexpr` so that every translation unit including this
   * header shares a single definition instead of getting its own
   * internal-linkage copy.
   */
  inline constexpr std::u32string_view g_exception_chars =
      U"\u00B7\u00DF\u0375\u03C2\u05F3\u05F4\u0640\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667"
      U"\u0668\u0669\u06F0\u06F1\u06F2\u06F3\u06F4\u06F5\u06F6\u06F7\u06F8\u06F9\u06FD\u06FE\u07FA"
      U"\u0F0B\u3007\u302E\u302F\u3031\u3032\u3033\u3034\u3035\u303B\u30FB";
  /**
   * @brief Characters that are DISALLOWED by special case in the Exceptions
   * (https://datatracker.ietf.org/doc/html/rfc5892#section-2.6) table.
   *
   * Declared `inline constexpr`: the inline function below odr-uses this
   * object, so it must be the same entity in every translation unit.
   */
  inline constexpr std::u32string_view g_exception_disallowed_chars =
      U"\u0640\u07FA\u302E\u302F\u3031\u3032\u3033\u3034\u3035\u303B";
  /**
   * @brief Determine if the character is NOT one of the characters DISALLOWED
   * by special case in the Exceptions
   * (https://datatracker.ietf.org/doc/html/rfc5892#section-2.6) table.
   *
   * @param c The code point to test.
   * @return true when @p c does not appear in g_exception_disallowed_chars,
   * false when it does.
   */
  inline bool is_not_disallowed_exception(char32_t c) {
    return g_exception_disallowed_chars.find(c) == std::u32string_view::npos;
  }
  /**
   * @brief Check whether the code point falls inside the "Greek" character
   * range, U+0370 through U+03FF inclusive.
   */
  inline bool is_greek(char32_t c) {
    constexpr char32_t first = U'\u0370';
    constexpr char32_t last = U'\u03FF';
    return first <= c && c <= last;
  }
  /**
   * @brief Check whether the code point falls inside the "Hebrew" character
   * range, U+0590 through U+05FF inclusive.
   */
  inline bool is_hebrew(char32_t c) {
    constexpr char32_t first = U'\u0590';
    constexpr char32_t last = U'\u05FF';
    return first <= c && c <= last;
  }
  /**
   * @brief Determine if the character is in the "Han" (Kanji), "Hiragana", or
   * "Katakana" character ranges, excepting "KATAKANA MIDDLE DOT" (U+30FB).
   *
   * @param c The code point to test.
   * @return true when @p c lies inside any of the inclusive ranges below.
   */
  inline bool is_jp(char32_t c) {
    using P = std::pair<char32_t, char32_t>;
    // Inclusive {first, last} code point ranges. U+3040..U+30FF is split in
    // two so that U+30FB itself is not matched.
    static constexpr std::array range{
        P{U'\u3040', U'\u30FA'},         P{U'\u30FC', U'\u30FF'},
        P{U'\u4E00', U'\u9FFF'},         P{U'\u3400', U'\u4DBF'},
        P{U'\U00020000', U'\U0002A6DF'}, P{U'\U0002A700', U'\U0002EBEF'},
        P{U'\U00030000', U'\U000323AF'}, P{U'\U0002EBF0', U'\U0002EE5F'},
        P{U'\U000323B0', U'\U0003347F'}, P{U'\uF900', U'\uFAFF'},
        P{U'\u2E80', U'\u303F'},         P{U'\u31C0', U'\u31EF'}};
    for (auto const & [first, last] : range) {
      if (c >= first && c <= last) {
        return true;
      }
    }
    return false;
  }
  /**
   * @brief Check whether the code point is the ASCII lowercase letter 'l',
   * which is needed when handling "MIDDLE DOT".
   */
  inline bool is_l_char(char32_t c) {
    constexpr char32_t lowercase_l = U'l';
    return c == lowercase_l;
  }
  63. }
  64. namespace jvalidate::format::detail {
  65. template <auto F> constexpr auto char_before(std::u32string_view str, size_t n) {
  66. return n != 0 && F(str[n - 1]);
  67. }
  68. template <auto F> constexpr auto char_after(std::u32string_view str, size_t n) {
  69. return (n != str.size() - 1) && F(str[n + 1]);
  70. }
  71. template <auto F> constexpr auto before_or_after(std::u32string_view str, size_t n) {
  72. return char_before<F>(str, n) || char_after<F>(str, n);
  73. }
  74. template <auto F> constexpr auto before_and_after(std::u32string_view str, size_t n) {
  75. return char_before<F>(str, n) && char_after<F>(str, n);
  76. }
  77. template <auto F> constexpr auto any_other_char(std::u32string_view str, size_t n) {
  78. return std::any_of(str.begin(), str.begin() + n, F) ||
  79. std::any_of(str.begin() + n + 1, str.end(), F);
  80. }
  81. }
  82. namespace jvalidate::format::detail {
  83. using std::string_view_literals::operator""sv;
  84. template <typename CharT> struct char_delimiters;
  85. template <> struct char_delimiters<char> {
  86. static constexpr std::string_view hostname_part_delims{"."};
  87. static constexpr std::string_view dotdot{".."};
  88. static constexpr std::string_view double_slash{"//"};
  89. static constexpr std::string_view illegal_hostname_chars;
  90. static constexpr std::string_view punycode_prefix{"xn--"};
  91. static constexpr std::string_view illegal_dashes_ulabel{"--"};
  92. static constexpr std::array<special_case<char>, 0> special_cases;
  93. };
  94. template <> struct char_delimiters<char32_t> {
  95. static constexpr std::u32string_view hostname_part_delims{U".\u3002\uff0e\uff61"};
  96. static constexpr std::u32string_view dotdot{U".."};
  97. static constexpr std::u32string_view double_slash{U"//"};
  98. static constexpr std::u32string_view punycode_prefix{U"xn--"};
  99. static constexpr std::u32string_view illegal_dashes_ulabel{U"--"};
  100. static constexpr std::u32string_view illegal_hostname_chars{U"\u302E"};
  101. static constexpr std::array special_cases{
  102. special_case{U"\u0375"sv, char_after<is_greek>},
  103. special_case{U"\u05f3"sv, char_before<is_hebrew>},
  104. special_case{U"\u05f4"sv, char_before<is_hebrew>},
  105. special_case{U"\u00b7"sv, before_and_after<is_l_char>},
  106. special_case{U"\u30fb"sv, any_other_char<is_jp>},
  107. special_case{g_exception_chars, before_or_after<is_not_disallowed_exception>},
  108. };
  109. };
  110. template <typename CharT> bool is_special_case_ok(std::basic_string_view<CharT> name) {
  111. return std::ranges::all_of(detail::char_delimiters<CharT>::special_cases,
  112. [name](auto & sc) { return sc.accepts(name); });
  113. }
  114. }