/**
 * Utility functions for managing strings, specifically because C++'s
 * std::string/std::regex are not well suited to handling UTF-8 text.
 */
#pragma once

#include <cstddef>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <string_view>

#if __has_include(<unicode/std_string.h>)
#define JVALIDATE_HAS_ICU
#include <unicode/brkiter.h>
#include <unicode/unistr.h>
#endif

namespace jvalidate::detail {
/**
 * @brief Calculates the string-length of the argument, treating each
 * multi-byte UTF-8 sequence as a single character (which std::string cannot
 * do on its own).
 *
 * When ICU is available the count is delegated to
 * icu::UnicodeString::countChar32(). Without ICU the code points are counted
 * directly from the UTF-8 encoding, so that both build configurations agree
 * for well-formed input instead of the fallback reporting the byte count.
 *
 * @param arg Any UTF8 compatible string (including a standard ASCII string)
 *
 * @returns A number no greater than arg.size(): the number of code points in
 * the string.
 */
inline size_t length(std::string_view arg) {
#ifdef JVALIDATE_HAS_ICU
  icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
  return static_cast<size_t>(ucs.countChar32());
#else
  size_t count = 0;
  for (unsigned char const byte : arg) {
    // Continuation bytes have the bit pattern 10xxxxxx; every other byte
    // (ASCII or a multi-byte lead byte) starts a new code point.
    if ((byte & 0xC0U) != 0x80U) {
      ++count;
    }
  }
  return count;
#endif
}

/**
 * @brief Ensures that any non-ASCII grapheme in the given regular expression
 * that is directly followed by one of the quantifiers '?', '*' or '+' is
 * wrapped in parentheses, in order to ensure that e.g. <PIRATE-EMOJI>*
 * properly matches the entire emoji multiple times, instead of just the last
 * byte of its encoding.
 *
 * Because we are only performing a regex search, and not matching/capturing
 * groups - we don't care that all of these extra parentheses cause us to
 * generate new capture-groups or push some of the groups to a later point.
 *
 * Graphemes made only of ASCII characters are copied through untouched: the
 * byte-oriented regex engine already quantifies them correctly, and wrapping
 * them would corrupt constructs such as "\*", ")*" or "]*".
 *
 * Without ICU support the pattern is returned unchanged.
 *
 * @param arg A regular expression string, to be sanitized for UTF8 pattern-
 * matching.
 *
 * @returns The regular expression, with some more parentheses added.
 */
inline std::string regex_escape(std::string_view arg) {
#ifdef JVALIDATE_HAS_ICU
  icu::UnicodeString const ucs = icu::UnicodeString::fromUTF8(icu::StringPiece(arg));
  // Short-circuit if there are no multi-byte codepoints or graphemes, since
  // C++ regexes don't have any problems with those.
  if (static_cast<size_t>(ucs.countChar32()) == arg.size()) {
    return std::string(arg);
  }

  UErrorCode status = U_ZERO_ERROR;
  // createCharacterInstance directly uses new - without any special allocation
  // rules or cleanup - so the result can simply be owned by a unique_ptr.
  std::unique_ptr<icu::BreakIterator> const iter(
      icu::BreakIterator::createCharacterInstance(nullptr, status));

  // This should never occur - unless there's like an alloc error
  if (U_FAILURE(status) || !iter) {
    return std::string(arg);
  }

  // Compare whole UTF-16 code units: narrowing to char (as strchr does) would
  // make e.g. U+012A look like '*' and U+0100 look like the NUL terminator.
  auto const is_quantifier = [](char16_t unit) {
    return unit == u'?' || unit == u'*' || unit == u'+';
  };
  // True if any code unit in [from, to) lies outside of the ASCII range.
  auto const has_non_ascii = [&ucs](int32_t from, int32_t to) {
    for (int32_t idx = from; idx < to; ++idx) {
      if (ucs.charAt(idx) > 0x7F) {
        return true;
      }
    }
    return false;
  };

  icu::UnicodeString rval;
  int32_t const total = ucs.length();
  iter->setText(ucs);
  // Walk the pattern one grapheme at a time; [start, end) is the current one.
  // The quantifier itself is emitted verbatim by the following iteration.
  for (int32_t start = iter->first(), end = iter->next();
       end != icu::BreakIterator::DONE; start = end, end = iter->next()) {
    bool const quantified = end < total && is_quantifier(ucs.charAt(end));
    if (quantified && has_non_ascii(start, end)) {
      rval.append('(');
      rval.append(ucs, start, end - start);
      rval.append(')');
    } else {
      rval.append(ucs, start, end - start);
    }
  }

  std::string out;
  rval.toUTF8String(out);
  return out;
#else
  return std::string(arg);
#endif
}
}