/**
 * Utility functions for managing strings, specifically because C++'s
 * std::string/std::regex are not well suited for UTF-8 comprehension.
 */
#pragma once

// NOTE(review): the ICU header names below were reconstructed from the
// symbols this file uses (icu::UnicodeString, icu::StringPiece,
// icu::BreakIterator) - confirm against the build configuration.
#if __has_include(<unicode/unistr.h>)
#define JVALIDATE_HAS_ICU
#include <unicode/brkiter.h>
#include <unicode/unistr.h>
#endif

#include <cstddef>
#include <memory>
#include <string>
#include <string_view>

namespace jvalidate::detail {
/**
 * @brief Calculates the string-length of the argument, treating multi-byte
 * characters as single characters (which std::string cannot do).
 *
 * With ICU available the count is the number of Unicode code points reported
 * by icu::UnicodeString::countChar32(). Without ICU the UTF-8 lead bytes are
 * counted instead, which gives the same answer for well-formed UTF-8.
 *
 * @param arg Any UTF8 compatible string (including a standard ASCII string)
 *
 * @returns A number no greater than arg.size(), depending on the number of
 * code points in the string.
 */
inline size_t length(std::string_view arg) {
#ifdef JVALIDATE_HAS_ICU
  icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
  return static_cast<size_t>(ucs.countChar32());
#else
  // Every code point contributes exactly one byte that is NOT a UTF-8
  // continuation byte (0b10xxxxxx), so counting those bytes counts code points.
  size_t count = 0;
  for (unsigned char const byte : arg) {
    if ((byte & 0xC0U) != 0x80U) {
      ++count;
    }
  }
  return count;
#endif
}

/**
 * @brief Ensures that any multi-byte codepoints/graphemes in the given regular
 * expression that are followed by a quantifier are wrapped in parentheses, so
 * that e.g. "<emoji>+" matches the entire emoji one or more times, instead of
 * repeating just the last byte of its UTF-8 encoding.
 *
 * Because we are only performing a regex search, and not matching/capturing
 * groups - we don't care that all of these extra parentheses cause us to
 * generate new capture-groups or push some of the groups to a later point.
 *
 * Only grapheme clusters containing at least one non-ASCII code unit are
 * wrapped. Pure-ASCII atoms are left untouched, because wrapping them would
 * corrupt constructs such as "[a-z]+" or "\d+".
 *
 * Known limitation: a multi-byte character that sits directly before a
 * literal quantifier character inside a character class (e.g. "[<emoji>+]")
 * is still wrapped, since character-class state is not tracked.
 *
 * Without ICU the pattern is returned unchanged.
 *
 * @param arg A regular expression string, to be sanitized for UTF8 pattern-
 * matching.
 *
 * @returns The regular expression, with some more parentheses added.
 */
inline std::string regex_escape(std::string_view arg) {
#ifdef JVALIDATE_HAS_ICU
  icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
  // Short-circuit if there are no multi-byte codepoints or graphemes, since
  // C++ regexes don't have any problems with those.
  if (static_cast<size_t>(ucs.countChar32()) == arg.size()) {
    return std::string(arg);
  }

  UErrorCode status = U_ZERO_ERROR;
  // createCharacterInstance hands back a heap-allocated iterator that the
  // caller owns; a null locale name selects the default locale.
  std::unique_ptr<icu::BreakIterator> iter(
      icu::BreakIterator::createCharacterInstance(nullptr, status));
  // This should never occur - unless there's like an alloc error
  if (U_FAILURE(status) || !iter) {
    return std::string(arg);
  }

  // Compare UTF-16 code units directly. Routing them through std::strchr
  // would narrow them to char, so e.g. U+012A would be mistaken for '*' and
  // U+0100 would match the string terminator.
  auto const is_quantifier = [](char16_t unit) {
    return unit == u'?' || unit == u'*' || unit == u'+' || unit == u'{';
  };
  // A cluster made only of ASCII units is a single-byte sequence in UTF-8 and
  // is already handled correctly by byte-oriented regex engines.
  auto const has_non_ascii = [&ucs](int32_t from, int32_t to) {
    for (int32_t idx = from; idx < to; ++idx) {
      if (ucs.charAt(idx) > 0x7F) {
        return true;
      }
    }
    return false;
  };

  iter->setText(ucs);
  icu::UnicodeString rval;
  for (int32_t start = iter->first(), end = iter->next();
       end != icu::BreakIterator::DONE; start = end, end = iter->next()) {
    // Group the cluster only when a quantifier immediately follows it; the
    // quantifier itself is copied verbatim on the next iteration.
    bool const wrap = end < ucs.length() && is_quantifier(ucs.charAt(end)) &&
                      has_non_ascii(start, end);
    if (wrap) {
      rval.append(u'(');
    }
    rval.append(ucs, start, end - start);
    if (wrap) {
      rval.append(u')');
    }
  }

  std::string out;
  rval.toUTF8String(out);
  return out;
#else
  return std::string(arg);
#endif
}
}