string.h 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. /**
  2. * Utility functions for managing strings, specifically because C++'s
  3. * std::string/std::regex is not well suited for UTF8 comprehensions.
  4. */
  5. #pragma once
  6. #if __has_include(<unicode/std_string.h>)
  7. #define JVALIDATE_HAS_ICU
  8. #include <unicode/brkiter.h>
  9. #include <unicode/unistr.h>
  10. #endif
  11. #include <iostream>
  12. namespace jvalidate::detail {
  /**
   * @brief Calculates the length of the argument in Unicode code points,
   * so that a multi-byte UTF-8 character counts as a single character
   * (which std::string::size() cannot do, since it counts bytes).
   *
   * With ICU available the count is delegated to
   * icu::UnicodeString::countChar32(). Without ICU, the UTF-8 bytes are
   * scanned directly and every byte that is not a continuation byte
   * (bit pattern 10xxxxxx) is counted as the start of one code point.
   *
   * Note that code points are counted, not grapheme clusters: a base
   * character followed by a combining mark counts as two.
   *
   * @param arg Any UTF-8 compatible string (including a plain ASCII string).
   * In the non-ICU fallback malformed input is not diagnosed; stray
   * continuation bytes are simply not counted.
   *
   * @returns A number no greater than arg.size(); equal to arg.size() when
   * the string is pure ASCII.
   */
  inline size_t length(std::string_view arg) {
  #ifdef JVALIDATE_HAS_ICU
    icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
    // countChar32() yields a non-negative int32_t; make the widening explicit.
    return static_cast<size_t>(ucs.countChar32());
  #else
    size_t code_points = 0;
    for (char const byte : arg) {
      // Continuation bytes look like 10xxxxxx; anything else starts a new
      // code point (ASCII byte or multi-byte lead byte).
      if ((static_cast<unsigned char>(byte) & 0xC0U) != 0x80U) {
        ++code_points;
      }
    }
    return code_points;
  #endif
  }
  /**
   * @brief Wraps every non-ASCII grapheme that is directly followed by a
   * quantifier in parentheses, so that e.g. <PIRATE-EMOJI>* repeats the
   * entire emoji instead of just the last byte of its UTF-8 encoding.
   *
   * Because we are only performing a regex search, and not matching/capturing
   * groups - we don't care that the extra parentheses create new
   * capture-groups or push existing groups to a later index.
   *
   * Only graphemes containing at least one non-ASCII code unit are wrapped.
   * A single ASCII character is already one byte, so it needs no help, and
   * wrapping it would corrupt constructs such as "]*", ")*" or "\*".
   *
   * Limitation: the pattern is not parsed, so a non-ASCII grapheme followed
   * by a quantifier character inside a character class (e.g. "[<EMOJI>*]")
   * is wrapped as well.
   *
   * Without ICU the pattern is returned unchanged.
   *
   * @param arg A regular expression string, to be sanitized for UTF8 pattern-
   * matching.
   *
   * @returns The regular expression, with parentheses added where needed. If
   * the ICU break iterator cannot be created, the input is returned as-is.
   */
  inline std::string regex_escape(std::string_view arg) {
  #ifdef JVALIDATE_HAS_ICU
    icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));

    // Short-circuit: as many code points as bytes means every code point is a
    // single byte, and C++ regexes don't have any problems with those.
    if (static_cast<size_t>(ucs.countChar32()) == arg.size()) {
      return std::string(arg);
    }

    UErrorCode status = U_ZERO_ERROR;
    // createCharacterInstance hands back a heap-allocated iterator that the
    // caller owns, hence the unique_ptr.
    std::unique_ptr<icu::BreakIterator> iter(
        icu::BreakIterator::createCharacterInstance(icu::Locale::getDefault(), status));

    // This should never occur - unless there's like an alloc error
    if (U_FAILURE(status) || !iter) {
      return std::string(arg);
    }

    // Compare whole UTF-16 code units. Narrowing the unit to char (as
    // std::strchr would) makes e.g. U+043F look like '?' and U+0000 match the
    // string terminator.
    auto const is_quantifier = [](char16_t unit) {
      return unit == u'?' || unit == u'*' || unit == u'+' || unit == u'{';
    };

    // True if any code unit in [from, to) lies outside of ASCII.
    auto const has_non_ascii = [&ucs](int32_t from, int32_t to) {
      for (int32_t idx = from; idx < to; ++idx) {
        if (ucs.charAt(idx) > 0x7F) {
          return true;
        }
      }
      return false;
    };

    icu::UnicodeString rval;
    iter->setText(ucs);

    int32_t const total = ucs.length();
    int32_t start = iter->first();
    for (int32_t end = iter->next(); end != icu::BreakIterator::DONE;
         start = end, end = iter->next()) {
      // [start, end) is one grapheme; `end` indexes the unit right after it.
      bool const quantified = end < total && is_quantifier(ucs.charAt(end));

      if (quantified && has_non_ascii(start, end)) {
        rval.append(u'(');
        rval.append(ucs, start, end - start);
        rval.append(u')');
      } else {
        rval.append(ucs, start, end - start);
      }
      // The quantifier itself is ASCII, so it is copied through verbatim by
      // the next iteration (including lazy forms such as "*?").
    }

    std::string out;
    rval.toUTF8String(out);
    return out;
  #else
    return std::string(arg);
  #endif
  }
  88. }