|
@@ -32,64 +32,6 @@ inline size_t length(std::string_view arg) {
|
|
|
return ucs.countChar32();
|
|
return ucs.countChar32();
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-/**
|
|
|
|
|
- * @brief Ensures that any codepoints/graphemes in the given regular expression
|
|
|
|
|
- * are wrapped in parenthesis in order to ensure that e.g. <PIRATE-EMOJI>*
|
|
|
|
|
- * properly matches the entire emoji multiple times, instead of just the last
|
|
|
|
|
- * byte of the string.
|
|
|
|
|
- *
|
|
|
|
|
- * Because we are only performing a regex search, and not matching/capturing
|
|
|
|
|
- * groups - we don't care that all of these extra parenthesis cause us to
|
|
|
|
|
- * generate new capture-groups or push some of the groups to a later point.
|
|
|
|
|
- *
|
|
|
|
|
- * @param arg A regular expression string, to be sanitized for UTF8 pattern-
|
|
|
|
|
- * matching.
|
|
|
|
|
- *
|
|
|
|
|
- * @returns The regular expression, with some more parenthesis added.
|
|
|
|
|
- */
|
|
|
|
|
-inline std::string regex_escape(std::string_view arg) {
|
|
|
|
|
- icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
|
|
|
|
|
- // Short-circuit if there are no multi-byte codepoints or graphemes, since
|
|
|
|
|
- // C++ regexes don't have any problems with those.
|
|
|
|
|
- if (ucs.countChar32() == arg.size()) {
|
|
|
|
|
- return std::string(arg);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- UErrorCode status = U_ZERO_ERROR;
|
|
|
|
|
- // createCharacterInstance directly uses new - without any special allocation
|
|
|
|
|
- // rules or cleanup, since the first argument is NULL.
|
|
|
|
|
- std::unique_ptr<icu::BreakIterator> iter(
|
|
|
|
|
- icu::BreakIterator::createCharacterInstance(NULL, status));
|
|
|
|
|
-
|
|
|
|
|
- // This should never occur - unless there's like an alloc error
|
|
|
|
|
- if (U_FAILURE(status)) {
|
|
|
|
|
- return std::string(arg);
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- icu::UnicodeString rval;
|
|
|
|
|
- iter->setText(ucs);
|
|
|
|
|
- int32_t start = iter->first();
|
|
|
|
|
- int32_t end = iter->next();
|
|
|
|
|
- while (end != icu::BreakIterator::DONE) {
|
|
|
|
|
- // 0-or-1, 1-or-more, 0-or-more markings
|
|
|
|
|
- // This could be optimized to only operate when on a multibyte character
|
|
|
|
|
- if (std::strchr("?*+", ucs.charAt(end))) {
|
|
|
|
|
- rval.append('(');
|
|
|
|
|
- rval.append(ucs, start, end - start);
|
|
|
|
|
- rval.append(')');
|
|
|
|
|
- rval.append(ucs.char32At(end));
|
|
|
|
|
- end = iter->next();
|
|
|
|
|
- } else {
|
|
|
|
|
- rval.append(ucs, start, end - start);
|
|
|
|
|
- }
|
|
|
|
|
- start = end;
|
|
|
|
|
- end = iter->next();
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- std::string out;
|
|
|
|
|
- return rval.toUTF8String(out);
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
/**
 * @brief Identity overload of to_u8 for narrow string views.
 *
 * The view is handed back as-is: no copy is made and no validation or
 * transcoding is performed (presumably the input is already UTF-8 - confirm
 * against callers).
 *
 * @param arg The view to pass through.
 *
 * @returns `arg`, unchanged.
 */
inline std::string_view to_u8(std::string_view arg) {
  return arg;
}
|
|
inline std::string_view to_u8(std::string_view arg) { return arg; }
|
|
|
|
|
|
|
|
inline std::string to_u8(std::u32string_view arg) {
|
|
inline std::string to_u8(std::u32string_view arg) {
|