string.h 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. /**
  2. * Utility functions for managing strings, specifically because C++'s
  3. * std::string/std::regex is not well suited for UTF8 comprehensions.
  4. */
  5. #pragma once
  6. #include <string>
  7. #if __has_include(<unicode/std_string.h>)
  8. #define JVALIDATE_HAS_ICU
  9. #include <unicode/brkiter.h>
  10. #include <unicode/unistr.h>
  11. #endif
  12. #include <jvalidate/detail/expect.h>
  13. namespace jvalidate::detail {
  /**
   * @brief Calculates the string-length of the argument, treating each
   * multi-byte UTF8 sequence as a single character (which std::string cannot
   * do).
   *
   * The result is a count of Unicode codepoints. A grapheme that is composed of
   * several codepoints (e.g. a base letter plus a combining mark) therefore
   * counts once per codepoint.
   *
   * @param arg Any UTF8 compatible string (including a standard ASCII string)
   *
   * @returns A number no greater than arg.size(), equal to the number of
   * codepoints in the string.
   */
  inline size_t length(std::string_view arg) {
#ifdef JVALIDATE_HAS_ICU
    icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
    return static_cast<size_t>(ucs.countChar32());
#else
    // Without ICU we can still count codepoints: every codepoint contributes
    // exactly one byte that is NOT a continuation byte (0b10xxxxxx).
    size_t count = 0;
    for (char const byte : arg) {
      if ((static_cast<unsigned char>(byte) & 0xC0) != 0x80) {
        ++count;
      }
    }
    return count;
#endif
  }
  /**
   * @brief Ensures that any multi-byte codepoints/graphemes in the given regular
   * expression that are directly followed by a quantifier are wrapped in
   * parentheses, in order to ensure that e.g. <PIRATE-EMOJI>* properly matches
   * the entire emoji multiple times, instead of just the last byte of the
   * string.
   *
   * Only units containing at least one non-ASCII byte are wrapped. Single-byte
   * atoms already work in a byte-oriented regex, and wrapping things like ']',
   * ')' or the 'd' of "\d" would produce an invalid expression. Units inside a
   * character class ([...]) or directly after a backslash are left alone for
   * the same reason.
   *
   * Because we are only performing a regex search, and not matching/capturing
   * groups - we don't care that all of these extra parentheses cause us to
   * generate new capture-groups or push some of the groups to a later point.
   *
   * With ICU available the wrapped unit is a full grapheme cluster. Without ICU
   * the wrapped unit is a single UTF8-encoded codepoint.
   *
   * @param arg A regular expression string, to be sanitized for UTF8 pattern-
   * matching.
   *
   * @returns The regular expression, with some more parentheses added.
   */
  inline std::string regex_escape(std::string_view arg) {
    // 0-or-1, 0-or-more, 1-or-more and bounded ({n,m}) markings
    auto const is_quantifier = [](char32_t chr) {
      return chr == U'?' || chr == U'*' || chr == U'+' || chr == U'{';
    };

    std::string out;
    bool escaped = false;  // the previous unit was an unescaped backslash
    bool in_class = false; // we are currently between '[' and ']'

    // Appends one unit (grapheme or codepoint, UTF8 encoded) to the output,
    // parenthesizing it when a quantifier would otherwise only bind to its
    // final byte.
    auto const emit = [&](std::string_view unit, bool quantified) {
      bool multibyte = false;
      for (char const byte : unit) {
        multibyte = multibyte || (static_cast<unsigned char>(byte) & 0x80) != 0;
      }

      bool const wrap = multibyte && quantified && !escaped && !in_class;
      if (wrap) {
        out += '(';
      }
      out.append(unit.data(), unit.size());
      if (wrap) {
        out += ')';
      }

      if (escaped) {
        escaped = false;
      } else if (unit == "\\") {
        escaped = true;
      } else if (unit == "[") {
        in_class = true;
      } else if (unit == "]") {
        in_class = false;
      }
    };

#ifdef JVALIDATE_HAS_ICU
    icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
    // Short-circuit if there are no multi-byte codepoints or graphemes, since
    // C++ regexes don't have any problems with those.
    if (static_cast<size_t>(ucs.countChar32()) == arg.size()) {
      return std::string(arg);
    }

    UErrorCode status = U_ZERO_ERROR;
    std::unique_ptr<icu::BreakIterator> const iter(
        icu::BreakIterator::createCharacterInstance(icu::Locale::getDefault(), status));
    // This should never occur - unless there's like an alloc error
    if (U_FAILURE(status) || !iter) {
      return std::string(arg);
    }

    out.reserve(arg.size());
    iter->setText(ucs);

    std::string unit;
    int32_t start = iter->first();
    for (int32_t end = iter->next(); end != icu::BreakIterator::DONE;
         start = end, end = iter->next()) {
      unit.clear();
      ucs.tempSubStringBetween(start, end).toUTF8String(unit);
      // Compare the UTF-16 code unit directly: narrowing it to char (as
      // strchr does) would mistake e.g. U+4E2A for '*'.
      emit(unit, end < ucs.length() && is_quantifier(ucs.charAt(end)));
    }
#else
    out.reserve(arg.size());
    for (size_t pos = 0; pos < arg.size();) {
      // A unit is a lead byte plus all continuation bytes (0b10xxxxxx) after it
      size_t next = pos + 1;
      while (next < arg.size() && (static_cast<unsigned char>(arg[next]) & 0xC0) == 0x80) {
        ++next;
      }
      emit(arg.substr(pos, next - pos),
           next < arg.size() && is_quantifier(static_cast<unsigned char>(arg[next])));
      pos = next;
    }
#endif
    return out;
  }
  /**
   * @brief Converts a UTF8 encoded string into a UTF-32 string, one element per
   * codepoint.
   *
   * With ICU available the conversion is delegated to icu::UnicodeString.
   * Without ICU a small built-in decoder is used, so that this header still
   * compiles. That decoder emits U+FFFD (REPLACEMENT CHARACTER) for every byte
   * that does not start a well-formed sequence, and rejects overlong
   * encodings, surrogates, and values above U+10FFFF.
   *
   * @param arg Any UTF8 compatible string (including a standard ASCII string)
   *
   * @returns The codepoints of arg, in order.
   *
   * @throws std::runtime_error (raised through JVALIDATE_THROW) if ICU reports
   * a failure during the UTF-32 conversion.
   */
  inline std::u32string to_u32(std::string_view arg) {
#ifdef JVALIDATE_HAS_ICU
    icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));

    std::u32string rval(static_cast<size_t>(ucs.countChar32()), U'\0');

    UErrorCode status = U_ZERO_ERROR;
    // char32_t and UChar32 are both 32 bits wide; ICU writes straight into the
    // string's buffer.
    ucs.toUTF32(reinterpret_cast<UChar32 *>(rval.data()), static_cast<int32_t>(rval.size()),
                status);
    // This should never occur - unless there's like an alloc error
    if (U_FAILURE(status)) {
      JVALIDATE_THROW(std::runtime_error, "UTF-32 Translation Error");
    }
    return rval;
#else
    constexpr char32_t replacement = 0xFFFD;

    std::u32string rval;
    rval.reserve(arg.size());

    size_t pos = 0;
    while (pos < arg.size()) {
      auto const lead = static_cast<unsigned char>(arg[pos]);

      size_t extra = 0;     // number of continuation bytes expected
      char32_t value = 0;   // codepoint accumulated so far
      char32_t minimum = 0; // smallest value that is not an overlong encoding
      if (lead < 0x80) {
        value = lead;
      } else if ((lead & 0xE0) == 0xC0) {
        extra = 1;
        value = lead & 0x1F;
        minimum = 0x80;
      } else if ((lead & 0xF0) == 0xE0) {
        extra = 2;
        value = lead & 0x0F;
        minimum = 0x800;
      } else if ((lead & 0xF8) == 0xF0) {
        extra = 3;
        value = lead & 0x07;
        minimum = 0x10000;
      } else {
        // Stray continuation byte, or a byte that can never appear in UTF8
        rval.push_back(replacement);
        ++pos;
        continue;
      }

      // pos < arg.size() holds here, so the subtraction cannot wrap
      bool valid = extra < arg.size() - pos;
      for (size_t i = 1; valid && i <= extra; ++i) {
        auto const trail = static_cast<unsigned char>(arg[pos + i]);
        valid = (trail & 0xC0) == 0x80;
        value = (value << 6) | (trail & 0x3F);
      }
      valid = valid && value >= minimum && value <= 0x10FFFF &&
              (value < 0xD800 || value > 0xDFFF);

      if (valid) {
        rval.push_back(value);
        pos += extra + 1;
      } else {
        rval.push_back(replacement);
        ++pos;
      }
    }
    return rval;
#endif
  }
  102. }