string.h 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  /**
   * Utility functions for managing strings, needed because C++'s std::string
   * and std::regex are not well suited to handling UTF-8 encoded text.
   */
  5. #pragma once
  6. #include <jvalidate/_config.h>
  7. #include <cstring>
  8. #include <memory>
  9. #include <string_view>
  10. #if JVALIDATE_HAS_ICU
  11. #include <unicode/brkiter.h>
  12. #include <unicode/unistr.h>
  13. #endif
  14. #if JVALIDATE_HAS_IDNA
  15. #include <ada/idna/unicode_transcoding.h>
  16. #endif
  17. #include <jvalidate/detail/expect.h>
  18. namespace jvalidate::detail {
  /**
   * @brief Pass-through overload for text that is already UTF-8. Having it lets
   * callers write to_u8(x) without caring which encoding x is held in.
   *
   * @param arg A view over UTF-8 text
   *
   * @returns The very same view; nothing is copied or re-encoded.
   */
  inline std::string_view to_u8(std::string_view arg) {
    return arg;
  }
  /**
   * @brief Pass-through overload for text that is already UTF-32. Having it lets
   * callers write to_u32(x) without caring which encoding x is held in.
   *
   * @param arg A view over UTF-32 text
   *
   * @returns The very same view; nothing is copied or re-encoded.
   */
  inline std::u32string_view to_u32(std::u32string_view arg) {
    return arg;
  }
  21. }
  22. namespace jvalidate::detail {
  23. #if JVALIDATE_HAS_IDNA
  24. /**
  25. * @brief Calculates the string-length of the argument, treating multi-byte
  26. * characters as their individual bytes (as if the string was a std::string).
  27. *
  28. * @param arg Any UTF8 compatible string (including a standard ASCII string)
  29. *
  30. * @returns A number no greater than arg.length(), depending on the number of
  31. * graphemes/codepoints in the string.
  32. */
  33. inline size_t length(std::string_view arg) {
  34. return ada::idna::utf32_length_from_utf8(arg.data(), arg.length());
  35. }
  36. inline std::string to_u8(std::u32string_view str) {
  37. auto data = std::make_unique_for_overwrite<char[]>(4 * str.length());
  38. size_t bytes = ada::idna::utf32_to_utf8(str.data(), str.length(), data.get());
  39. return std::string(data.get(), data.get() + bytes);
  40. }
  41. inline std::u32string to_u32(std::string_view str) {
  42. auto data = std::make_unique_for_overwrite<char32_t[]>(str.length());
  43. size_t bytes = ada::idna::utf8_to_utf32(str.data(), str.length(), data.get());
  44. return std::u32string(data.get(), data.get() + bytes);
  45. }
  46. #elif JVALIDATE_HAS_ICU
  47. /**
  48. * @brief Calculates the string-length of the argument, treating multi-byte
  49. * characters and unicode graphemes as single characters (which std::string
  50. * cannot do).
  51. *
  52. * @param arg Any UTF8 compatible string (including a standard ASCII string)
  53. *
  54. * @returns A number no greater than arg.length(), depending on the number of
  55. * graphemes/codepoints in the string.
  56. */
  57. inline size_t length(std::string_view arg) {
  58. icu::UnicodeString ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
  59. return ucs.countChar32();
  60. }
  61. #else
  /**
   * @brief Fallback length calculation used when neither IDNA nor ICU is part
   * of the toolchain. It makes no attempt to decode the text, so maxLength and
   * minLength constraints remain usable, but every byte of a multi-byte UTF-8
   * sequence is counted separately.
   *
   * @param arg Any UTF8 compatible string (including a standard ASCII string)
   *
   * @returns The size of arg in bytes, i.e. arg.length()
   */
  inline size_t length(std::string_view arg) {
    return arg.size();
  }
  73. #endif
  74. }