string.h 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  /**
   * Utility functions for managing strings, provided because C++'s
   * std::string/std::regex are not well suited to working with UTF8 text.
   */
  5. #pragma once
  6. #include <jvalidate/_config.h>
  7. #include <cstring>
  8. #include <memory>
  9. #include <string_view>
  10. #if JVALIDATE_HAS_ICU
  11. #include <unicode/brkiter.h>
  12. #include <unicode/unistr.h>
  13. #endif
  14. #if JVALIDATE_HAS_IDNA
  15. #include <ada/idna/unicode_transcoding.h>
  16. #endif
  17. #include <jvalidate/detail/expect.h>
  namespace jvalidate::detail {
  /**
   * @brief Pass-through overload for narrow strings: no transcoding is done,
   * the view's own element count is reported.
   *
   * @param arg A view over char elements
   *
   * @returns The number of char elements in arg
   */
  inline size_t length_u8(std::string_view arg) {
    return arg.size();
  }

  /**
   * @brief Pass-through overload for char32_t strings: no transcoding is
   * done, the view's own element count is reported.
   *
   * @param arg A view over char32_t elements
   *
   * @returns The number of char32_t elements in arg
   */
  inline size_t length_u32(std::u32string_view arg) {
    return arg.size();
  }

  /**
   * @brief Identity conversion for a string that is already a narrow string.
   *
   * @param arg A view over char elements
   *
   * @returns arg itself, unchanged (no copy of the underlying data is made)
   */
  inline std::string_view to_u8(std::string_view arg) {
    return arg;
  }

  /**
   * @brief Identity conversion for a string that is already a char32_t string.
   *
   * @param arg A view over char32_t elements
   *
   * @returns arg itself, unchanged (no copy of the underlying data is made)
   */
  inline std::u32string_view to_u32(std::u32string_view arg) {
    return arg;
  }
  } // namespace jvalidate::detail
  24. namespace jvalidate::detail {
  25. #if JVALIDATE_HAS_IDNA
  26. /**
  27. * @brief Calculates the string-length of the argument, treating multi-byte
  28. * characters as their individual bytes (as if the string was a std::string).
  29. *
  30. * @param arg A string encoded in UTF32
  31. *
  32. * @returns A number no greater than 4 * arg.length(), depending on the number
  33. * of graphemes/codepoints in the string.
  34. */
  35. inline size_t length_u8(std::u32string_view arg) {
  36. return ada::idna::utf8_length_from_utf32(arg.data(), arg.length());
  37. }
  38. /**
  39. * @brief Calculates the string-length of the argument, treating multi-byte
  40. * characters and unicode graphemes as single characters (which std::string
  41. * cannot do).
  42. *
  43. * @param arg Any UTF8 compatible string (including a standard ASCII string)
  44. *
  45. * @returns A number no greater than arg.length(), depending on the number of
  46. * graphemes/codepoints in the string.
  47. */
  48. inline size_t length_u32(std::string_view arg) {
  49. return ada::idna::utf32_length_from_utf8(arg.data(), arg.length());
  50. }
  51. inline std::string to_u8(std::u32string_view str) {
  52. auto data = std::make_unique_for_overwrite<char[]>(4 * str.length());
  53. size_t bytes = ada::idna::utf32_to_utf8(str.data(), str.length(), data.get());
  54. return std::string(data.get(), data.get() + bytes);
  55. }
  56. inline std::u32string to_u32(std::string_view str) {
  57. auto data = std::make_unique_for_overwrite<char32_t[]>(str.length());
  58. size_t bytes = ada::idna::utf8_to_utf32(str.data(), str.length(), data.get());
  59. return std::u32string(data.get(), data.get() + bytes);
  60. }
  61. #elif JVALIDATE_HAS_ICU
  62. inline size_t length_u32(std::string_view arg) {
  63. icu::UnicodeString ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
  64. return ucs.countChar32();
  65. }
  66. #endif
  /**
   * @brief Calculates the string-length of the argument for length
   * constraints such as maxLength and minLength. It is always available,
   * whether or not IDNA or ICU is present in the toolchain.
   *
   * When either JVALIDATE_HAS_IDNA or JVALIDATE_HAS_ICU is set, this is a
   * proxy for jvalidate::detail::length_u32. Otherwise no decoding is
   * attempted and the raw arg.length() is reported.
   *
   * @param arg Any UTF8 compatible string (including a standard ASCII string)
   *
   * @returns length_u32(arg) when IDNA or ICU is available (a number no
   * greater than arg.length()), otherwise arg.length().
   */
  inline size_t length(std::string_view arg) {
  #if JVALIDATE_HAS_IDNA || JVALIDATE_HAS_ICU
    return length_u32(arg);
  #else
    return arg.length();
  #endif
  }
  92. }