| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
/**
 * Utility functions for managing strings, needed because C++'s
 * std::string/std::regex are not well suited to handling UTF-8 text.
 */
- #pragma once
- #include <string>
- #if __has_include(<unicode/std_string.h>)
- #define JVALIDATE_HAS_ICU
- #include <unicode/brkiter.h>
- #include <unicode/unistr.h>
- #endif
- #include <jvalidate/detail/expect.h>
- #ifdef JVALIDATE_HAS_ICU
- namespace jvalidate::detail {
- /**
- * @brief Calclates the string-length of the argument, treating multi-byte
- * characters an unicode graphemes as single characters (which std::string
- * cannot do).
- *
- * @param arg Any UTF8 compatible string (including a standard ASCII string)
- *
- * @returns A number no greater than arg.size(), depending on the number of
- * graphemes/codepoints in the string.
- */
- inline size_t length(std::string_view arg) {
- icu::UnicodeString ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
- return ucs.countChar32();
- }
- /**
- * @brief Ensures that any codepoints/graphemes in the given regular expression
- * are wrapped in parenthesis in order to ensure that e.g. <PIRATE-EMOJI>*
- * properly matches the entire emoji multiple times, instead of just the last
- * byte of the string.
- *
- * Because we are only performing a regex search, and not matching/capturing
- * groups - we don't care that all of these extra parenthesis cause us to
- * generate new capture-groups or push some of the groups to a later point.
- *
- * @param arg A regular expression string, to be sanitized for UTF8 pattern-
- * matching.
- *
- * @returns The regular expression, with some more parenthesis added.
- */
- inline std::string regex_escape(std::string_view arg) {
- icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
- // Short-circuit if there are no multi-byte codepoints or graphemes, since
- // C++ regexes don't have any problems with those.
- if (ucs.countChar32() == arg.size()) {
- return std::string(arg);
- }
- UErrorCode status = U_ZERO_ERROR;
- // createCharacterInstance directly uses new - without any special allocation
- // rules or cleanup, since the first argument is NULL.
- std::unique_ptr<icu::BreakIterator> iter(
- icu::BreakIterator::createCharacterInstance(NULL, status));
- // This should never occur - unless there's like an alloc error
- if (U_FAILURE(status)) {
- return std::string(arg);
- }
- icu::UnicodeString rval;
- iter->setText(ucs);
- int32_t start = iter->first();
- int32_t end = iter->next();
- while (end != icu::BreakIterator::DONE) {
- // 0-or-1, 1-or-more, 0-or-more markings
- // This could be optimized to only operate when on a multibyte character
- if (std::strchr("?*+", ucs.charAt(end))) {
- rval.append('(');
- rval.append(ucs, start, end - start);
- rval.append(')');
- rval.append(ucs.char32At(end));
- end = iter->next();
- } else {
- rval.append(ucs, start, end - start);
- }
- start = end;
- end = iter->next();
- }
- std::string out;
- return rval.toUTF8String(out);
- }
- inline std::u32string to_u32(std::string_view arg) {
- icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
- std::u32string rval;
- size_t const capacity = ucs.countChar32();
- rval.resize(capacity);
- UErrorCode status = U_ZERO_ERROR;
- ucs.toUTF32(reinterpret_cast<int *>(rval.data()), capacity, status);
- // This should never occur - unless there's like an alloc error
- if (U_FAILURE(status)) {
- JVALIDATE_THROW(std::runtime_error, "UTF-32 Translation Error");
- }
- return rval;
- }
- }
- #else
- #endif
|