|
|
@@ -3,68 +3,65 @@
|
|
|
* std::string/std::regex is not well suited for UTF8 comprehensions.
|
|
|
*/
|
|
|
#pragma once
|
|
|
+#include <jvalidate/_config.h>
|
|
|
|
|
|
-#include <ostream>
|
|
|
+#include <memory>
|
|
|
#include <string>
|
|
|
#include <string_view>
|
|
|
-#if __has_include(<unicode/std_string.h>)
|
|
|
-#define JVALIDATE_HAS_ICU
|
|
|
-#include <unicode/brkiter.h>
|
|
|
-#include <unicode/unistr.h>
|
|
|
+
|
|
|
+#ifdef JVALIDATE_HAS_IDNA
|
|
|
+#include <ada/idna/unicode_transcoding.h>
|
|
|
#endif
|
|
|
|
|
|
#include <jvalidate/detail/expect.h>
|
|
|
|
|
|
-#ifdef JVALIDATE_HAS_ICU
|
|
|
namespace jvalidate::detail {
|
|
|
+inline size_t length_u8(std::string_view arg) { return arg.length(); }
|
|
|
+inline size_t length_u32(std::u32string_view arg) { return arg.length(); }
|
|
|
+
|
|
|
+inline std::string_view to_u8(std::string_view arg) { return arg; }
|
|
|
+inline std::u32string_view to_u32(std::u32string_view arg) { return arg; }
|
|
|
+}
|
|
|
+
|
|
|
+#ifdef JVALIDATE_HAS_IDNA
|
|
|
+namespace jvalidate::detail {
|
|
|
+/**
|
|
|
+ * @brief Calculates the string-length of the argument, treating multi-byte
|
|
|
+ * characters as their individual bytes (as if the string was a std::string).
|
|
|
+ *
|
|
|
+ * @param arg A string encoded in UTF32
|
|
|
+ *
|
|
|
+ * @returns A number no greater than 4 * arg.length(), depending on the number
|
|
|
+ * of graphemes/codepoints in the string.
|
|
|
+ */
|
|
|
+inline size_t length_u8(std::u32string_view arg) {
|
|
|
+ return ada::idna::utf8_length_from_utf32(arg.data(), arg.length());
|
|
|
+}
|
|
|
+
|
|
|
/**
|
|
|
* @brief Calculates the string-length of the argument, treating multi-byte
|
|
|
- * characters an unicode graphemes as single characters (which std::string
|
|
|
+ * characters and unicode graphemes as single characters (which std::string
|
|
|
* cannot do).
|
|
|
*
|
|
|
* @param arg Any UTF8 compatible string (including a standard ASCII string)
|
|
|
*
|
|
|
- * @returns A number no greater than arg.size(), depending on the number of
|
|
|
+ * @returns A number no greater than arg.length(), depending on the number of
|
|
|
* graphemes/codepoints in the string.
|
|
|
*/
|
|
|
-inline size_t length(std::string_view arg) {
|
|
|
- icu::UnicodeString ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
|
|
|
- return ucs.countChar32();
|
|
|
-}
|
|
|
-
|
|
|
-inline std::string_view to_u8(std::string_view arg) { return arg; }
|
|
|
-
|
|
|
-inline std::string to_u8(std::u32string_view arg) {
|
|
|
- icu::UnicodeString const ucs =
|
|
|
- icu::UnicodeString::fromUTF32(reinterpret_cast<int const *>(arg.data()), arg.size());
|
|
|
- std::string out;
|
|
|
- return ucs.toUTF8String(out);
|
|
|
+inline size_t length_u32(std::string_view arg) {
|
|
|
+ return ada::idna::utf32_length_from_utf8(arg.data(), arg.length());
|
|
|
}
|
|
|
|
|
|
-inline std::u32string_view to_u32(std::u32string_view arg) { return arg; }
|
|
|
-
|
|
|
-inline std::u32string to_u32(std::string_view arg) {
|
|
|
- icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
|
|
|
-
|
|
|
- std::u32string rval;
|
|
|
- size_t const capacity = ucs.countChar32();
|
|
|
- rval.resize(capacity);
|
|
|
-
|
|
|
- UErrorCode status = U_ZERO_ERROR;
|
|
|
- ucs.toUTF32(reinterpret_cast<int *>(rval.data()), capacity, status);
|
|
|
- // This should never occur - unless there's like an alloc error
|
|
|
- if (U_FAILURE(status)) {
|
|
|
- JVALIDATE_THROW(std::runtime_error, "UTF-32 Translation Error");
|
|
|
- }
|
|
|
-
|
|
|
- return rval;
|
|
|
-}
|
|
|
+inline std::string to_u8(std::u32string_view str) {
|
|
|
+ auto data = std::make_unique_for_overwrite<char[]>(4 * str.length());
|
|
|
+ size_t bytes = ada::idna::utf32_to_utf8(str.data(), str.length(), data.get());
|
|
|
+ return std::string(data.get(), data.get() + bytes);
|
|
|
}
|
|
|
|
|
|
-namespace std {
|
|
|
-inline std::ostream & operator<<(std::ostream & os, std::u32string_view str) {
|
|
|
- return os << jvalidate::detail::to_u8(str);
|
|
|
+inline std::u32string to_u32(std::string_view str) {
|
|
|
+ auto data = std::make_unique_for_overwrite<char32_t[]>(str.length());
|
|
|
+ size_t bytes = ada::idna::utf8_to_utf32(str.data(), str.length(), data.get());
|
|
|
+ return std::u32string(data.get(), data.get() + bytes);
|
|
|
}
|
|
|
}
|
|
|
-#else
|
|
|
#endif
|