// // tokenizer.cxx // string-utils // // Created by Sam Jaffe on 10/8/20. // Copyright © 2020 Sam Jaffe. All rights reserved. // #include "string_utils/tokenizer.h" namespace { bool is_escaped(std::string_view str, std::size_t p) { if (p == 0 || str[p - 1] != '\\') { return false; } return ((p - str.find_last_not_of('\\', p - 1) - 1) & 1) == 1; } auto promote(std::vector input) { return std::vector(input.cbegin(), input.cend()); } bool current_token_is(std::string_view token, size_t offset, std::string_view find) { return find.size() > 0 and token.compare(offset, find.size(), find) == 0; }; } namespace string_utils { Tokenizer::Tokenizer(std::string_view divider) : divider_(divider) {} Tokenizer & Tokenizer::max_outputs(size_t new_max_outputs) { max_outputs_ = new_max_outputs; return *this; } Tokenizer & Tokenizer::truncate(bool new_truncate) { truncate_ = new_truncate; return *this; } Tokenizer & Tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) { ignore_empty_tokens_ = new_ignore_empty_tokens; return *this; } Tokenizer & Tokenizer::reverse_search(bool new_reverse) { reverse_search_ = new_reverse; return *this; } EscapedTokenizer::EscapedTokenizer(std::string_view divider, Quote quote) : EscapedTokenizer(Tokenizer(divider), quote) {} EscapedTokenizer::EscapedTokenizer(Tokenizer tok, Quote quote) : Tokenizer(tok) { this->quote(quote); } EscapedTokenizer & EscapedTokenizer::max_outputs(size_t new_max_outputs) { Tokenizer::max_outputs(new_max_outputs); return *this; } EscapedTokenizer & EscapedTokenizer::truncate(bool new_truncate) { Tokenizer::truncate(new_truncate); return *this; } EscapedTokenizer & EscapedTokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) { Tokenizer::ignore_empty_tokens(new_ignore_empty_tokens); return *this; } EscapedTokenizer & EscapedTokenizer::reverse_search(bool new_reverse) { Tokenizer::reverse_search(new_reverse); return *this; } Tokenizer & Tokenizer::quote(Quote quote) { quote_ = quote; escapable_ = true; return *this; } EscapedTokenizer Tokenizer::escapable(Quote quote) const { return EscapedTokenizer(*this, quote); } std::vector Tokenizer::operator()(std::string_view input) const { std::vector rval; // If max_outputs_ == infinite_outputs, this will be infinite enough to work // since we'll hit overflow on the string itself before this. std::size_t const max = max_outputs_ - !truncate_; std::size_t const qsz = quote_.escaped.size(); size_t span = 0; auto index = [this, &input, &span]() { return reverse_search_ ? input.size() - span - 1 : span; }; bool in_quote{false}; while (not input.empty() and rval.size() < max and span == 0) { for (span = 0; span < input.size(); ++span) { // We check for escaped-quotes before we check for quotes to minimise // complexity. Once in a quote, we simply append everything without // checking for the divider until the end quote is encountered (escaped // quotes are processed normally). if (not quote_.escaped.empty() and current_token_is(input, index(), quote_.escaped)) { span += qsz - 1; } else if (input[index()] == quote_.on) { in_quote = !in_quote; } else if (in_quote or not current_token_is(input, index(), divider_)) { continue; } else if (escapable_ and is_escaped(input, index())) { continue; } else if (reverse_search_) { if (not ignore_empty_tokens_ or span > 0) { rval.push_back(input.substr(index() + 1, span)); } input.remove_suffix(span + divider_.size()); span = 0; break; } else { if (not ignore_empty_tokens_ or span > 0) { rval.push_back(input.substr(0, span)); } input.remove_prefix(span + divider_.size()); span = 0; break; } } } // Due to the special handling rules of the truncate feature, we need // to add an additional layer of handling around empty tokens and buffer if (ignore_empty_tokens_ and current_token_is(input, span, divider_)) { input.remove_prefix(divider_.size()); } // Additionally, we do not want to include the final element if there is // actually no data remaining in the buffer/input string, even when we permit // empty tokens in our output. if (rval.size() < max_outputs_ and not input.empty()) { rval.push_back(input); } if (reverse_search_) { std::reverse(rval.begin(), rval.end()); } return rval; } std::string Tokenizer::unescape(std::string_view token) const { std::string rval; rval.reserve(token.size()); for (size_t i = 0; i < token.size(); ++i) { /*if (current_token_is(token, i, "\\\\")) { rval.append(1, '\\'); ++i; } else */ // The order of these tests is important! // First we check if the current token is an escaped-quote - if so, // replace it with the regular quote. if (current_token_is(token, i, quote_.escaped)) { rval.append(1, quote_.on); } else if (token[i] == '\\' and current_token_is(token, i + 1, divider_)) { // Then we check if we're looking at an escaped divider rval.append(divider_); i += divider_.size(); } else if (token[i] != quote_.on) { // Lastly - we remote unescaped quotes rval.append(1, token[i]); } } return rval; } std::vector Tokenizer::operator()(std::string && input) const { return promote(operator()(std::string_view(input))); } std::vector EscapedTokenizer::operator()(std::string_view input) const { auto rval = promote(Tokenizer::operator()(std::string_view(input))); for (auto & token : rval) { token = unescape(token); } return rval; } std::vector EscapedTokenizer::operator()(std::string && input) const { return operator()(std::string_view(input)); } }