|
@@ -10,7 +10,8 @@
|
|
|
|
|
|
|
|
namespace string_utils {
|
|
namespace string_utils {
|
|
|
|
|
|
|
|
-tokenizer::tokenizer(std::string const & divider) : divider_(divider) {}
|
|
|
|
|
|
|
+tokenizer::tokenizer(std::string divider, struct quote quote)
|
|
|
|
|
+ : divider_(std::move(divider)), quote_(std::move(quote)) {}
|
|
|
|
|
|
|
|
tokenizer &tokenizer::max_outputs(size_t new_max_outputs) {
|
|
tokenizer &tokenizer::max_outputs(size_t new_max_outputs) {
|
|
|
max_outputs_ = new_max_outputs;
|
|
max_outputs_ = new_max_outputs;
|
|
@@ -37,36 +38,49 @@ static std::size_t countback(std::string const &str, std::size_t p, char c) {
|
|
|
return p - str.find_last_not_of(c, p - 1) - 1;
|
|
return p - str.find_last_not_of(c, p - 1) - 1;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-std::size_t tokenizer::find(std::string &input, std::size_t from) const {
|
|
|
|
|
- auto pos = input.find(divider_, from);
|
|
|
|
|
- while (escapable_ && pos != std::string::npos &&
|
|
|
|
|
- countback(input, pos, '\\') % 2) {
|
|
|
|
|
- input.erase(pos - 1, 1);
|
|
|
|
|
- pos = input.find(divider_, pos);
|
|
|
|
|
- }
|
|
|
|
|
- return pos;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-std::vector<std::string> tokenizer::operator()(std::string input) const {
|
|
|
|
|
|
|
+std::vector<std::string> tokenizer::operator()(std::string const &input) const {
|
|
|
|
|
+ auto equals_from = [&input](std::string const &token, std::size_t from) {
|
|
|
|
|
+ return token.size() + from < input.size() &&
|
|
|
|
|
+ std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0;
|
|
|
|
|
+ };
|
|
|
std::vector<std::string> rval;
|
|
std::vector<std::string> rval;
|
|
|
|
|
+ std::string buffer;
|
|
|
|
|
+ buffer.reserve(input.size());
|
|
|
// If max_outputs_ == infinite_outputs, this will be infinite enough to work
|
|
// If max_outputs_ == infinite_outputs, this will be infinite enough to work
|
|
|
// since we'll hit overflow on the string itself before this.
|
|
// since we'll hit overflow on the string itself before this.
|
|
|
- size_t const max = max_outputs_ - !truncate_;
|
|
|
|
|
- size_t i = 0;
|
|
|
|
|
- for (size_t n = find(input, i);
|
|
|
|
|
- n != std::string::npos && rval.size() < max;
|
|
|
|
|
- i = n + 1, n = find(input, i)) {
|
|
|
|
|
- if (ignore_empty_tokens_ && i == n) {
|
|
|
|
|
- continue;
|
|
|
|
|
|
|
+ std::size_t const max = max_outputs_ - !truncate_;
|
|
|
|
|
+ std::size_t const qsz = quote_.escaped.size();
|
|
|
|
|
+ std::size_t from = 0;
|
|
|
|
|
+ bool in_quote{false};
|
|
|
|
|
+ for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
|
|
|
|
|
+ // We check for escaped-quotes before we check for quotes to minimise
|
|
|
|
|
+ // complexity. Once in a quote, we simply append everything without checking
|
|
|
|
|
+ // for the divider until the end quote is encountered (escaped quotes are
|
|
|
|
|
+ // processed normally).
|
|
|
|
|
+ if (qsz > 0 && equals_from(quote_.escaped, pos)) {
|
|
|
|
|
+ buffer.append(1, quote_.on);
|
|
|
|
|
+ pos += qsz - 1;
|
|
|
|
|
+ } else if (input[pos] == quote_.on) {
|
|
|
|
|
+ in_quote = !in_quote;
|
|
|
|
|
+ } else if (in_quote || !equals_from(divider_, pos)) {
|
|
|
|
|
+ buffer.append(1, input[pos]);
|
|
|
|
|
+ } else if (escapable_ && countback(input, pos, '\\') % 2) {
|
|
|
|
|
+ buffer.back() = input[pos];
|
|
|
|
|
+ } else if (!in_quote) {
|
|
|
|
|
+ if (!ignore_empty_tokens_ || buffer.size()) {
|
|
|
|
|
+ rval.emplace_back(buffer);
|
|
|
|
|
+ }
|
|
|
|
|
+ from = pos + 1;
|
|
|
|
|
+ buffer.clear();
|
|
|
}
|
|
}
|
|
|
- rval.emplace_back(input.substr(i, n - i));
|
|
|
|
|
}
|
|
}
|
|
|
- // Special Handling for the final token
|
|
|
|
|
- if (ignore_empty_tokens_ && input.find(divider_, i) == i) {
|
|
|
|
|
- ++i;
|
|
|
|
|
|
|
+ // Due to the special handling rules of the truncate feature, we need
|
|
|
|
|
+ // to add an additional layer of handling around empty tokens and buffer
|
|
|
|
|
+ if (ignore_empty_tokens_ && equals_from(divider_, from)) {
|
|
|
|
|
+ ++from;
|
|
|
}
|
|
}
|
|
|
if (rval.size() < max_outputs_) {
|
|
if (rval.size() < max_outputs_) {
|
|
|
- rval.emplace_back(input.substr(i));
|
|
|
|
|
|
|
+ rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
|
|
|
}
|
|
}
|
|
|
return rval;
|
|
return rval;
|
|
|
}
|
|
}
|