// // tokenizer.cxx // string-utils // // Created by Sam Jaffe on 10/8/20. // Copyright © 2020 Sam Jaffe. All rights reserved. // #include "string_utils/tokenizer.h" namespace string_utils { tokenizer::tokenizer(std::string divider, struct quote quote) : divider_(std::move(divider)), quote_(std::move(quote)) {} tokenizer & tokenizer::max_outputs(size_t new_max_outputs) { max_outputs_ = new_max_outputs; return *this; } tokenizer & tokenizer::truncate(bool new_truncate) { truncate_ = new_truncate; return *this; } tokenizer & tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) { ignore_empty_tokens_ = new_ignore_empty_tokens; return *this; } tokenizer & tokenizer::escapable(bool new_escapable) { escapable_ = new_escapable; return *this; } static std::size_t countback(std::string const & str, std::size_t p, char c) { if (p == 0 || str[p - 1] != c) return 0; return p - str.find_last_not_of(c, p - 1) - 1; } std::vector tokenizer::operator()(std::string const & input) const { auto equals_from = [&input](std::string const & token, std::size_t from) { return token.size() + from < input.size() && std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0; }; std::vector rval; std::string buffer; buffer.reserve(input.size()); // If max_outputs_ == infinite_outputs, this will be infinite enough to work // since we'll hit overflow on the string itself before this. std::size_t const max = max_outputs_ - !truncate_; std::size_t const qsz = quote_.escaped.size(); std::size_t from = 0; bool in_quote{false}; for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) { // We check for escaped-quotes before we check for quotes to minimise // complexity. Once in a quote, we simply append everything without checking // for the divider until the end quote is encountered (escaped quotes are // processed normally). if (qsz > 0 && equals_from(quote_.escaped, pos)) { buffer.append(1, quote_.on); pos += qsz - 1; } else if (input[pos] == quote_.on) { in_quote = !in_quote; } else if (in_quote || !equals_from(divider_, pos)) { buffer.append(1, input[pos]); } else if (escapable_ && countback(input, pos, '\\') % 2) { buffer.back() = input[pos]; } else if (!in_quote) { if (!ignore_empty_tokens_ || buffer.size()) { rval.emplace_back(buffer); } from = pos + 1; buffer.clear(); } } // Due to the special handling rules of the truncate feature, we need // to add an additional layer of handling around empty tokens and buffer if (ignore_empty_tokens_ && equals_from(divider_, from)) { ++from; } // Additionally, we do not want to include the final element if there is // actually no data remaining in the buffer/input string, even when we permit // empty tokens in our output. if (rval.size() < max_outputs_ && !(buffer.empty() && from == input.size())) { rval.emplace_back(buffer.empty() ? input.substr(from) : buffer); } return rval; } }