/** * Utility functions for managing strings, specifically because C++'s * std::string/std::regex is not well suited for UTF8 comprehensions. */ #pragma once #include #include #include #if __has_include() #define JVALIDATE_HAS_ICU #include #include #endif #include #ifdef JVALIDATE_HAS_ICU namespace jvalidate::detail { /** * @brief Calclates the string-length of the argument, treating multi-byte * characters an unicode graphemes as single characters (which std::string * cannot do). * * @param arg Any UTF8 compatible string (including a standard ASCII string) * * @returns A number no greater than arg.size(), depending on the number of * graphemes/codepoints in the string. */ inline size_t length(std::string_view arg) { icu::UnicodeString ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg)); return ucs.countChar32(); } inline std::string_view to_u8(std::string_view arg) { return arg; } inline std::string to_u8(std::u32string_view arg) { icu::UnicodeString const ucs = icu::UnicodeString::fromUTF32(reinterpret_cast(arg.data()), arg.size()); std::string out; return ucs.toUTF8String(out); } inline std::u32string_view to_u32(std::u32string_view arg) { return arg; } inline std::u32string to_u32(std::string_view arg) { icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg)); std::u32string rval; size_t const capacity = ucs.countChar32(); rval.resize(capacity); UErrorCode status = U_ZERO_ERROR; ucs.toUTF32(reinterpret_cast(rval.data()), capacity, status); // This should never occur - unless there's like an alloc error if (U_FAILURE(status)) { JVALIDATE_THROW(std::runtime_error, "UTF-32 Translation Error"); } return rval; } } namespace std { inline std::ostream & operator<<(std::ostream & os, std::u32string_view str) { return os << jvalidate::detail::to_u8(str); } } #else #endif