@@ -8,103 +8,183 @@
 
 #include "string_utils/tokenizer.h"
 
-namespace string_utils {
+namespace {
+bool is_escaped(std::string_view str, std::size_t p) {
+  if (p == 0 || str[p - 1] != '\\') { return false; }
+  return ((p - str.find_last_not_of('\\', p - 1) - 1) & 1) == 1;
+}
 
-template <typename C> static void reverse(C & str) {
-  std::reverse(str.begin(), str.end());
+auto promote(std::vector<std::string_view> input) {
+  return std::vector<std::string>(input.cbegin(), input.cend());
 }
 
-tokenizer::tokenizer(std::string divider, struct quote quote)
-    : divider_(std::move(divider)), quote_(std::move(quote)) {}
+bool current_token_is(std::string_view token, size_t offset,
+                      std::string_view find) {
+  return find.size() > 0 and token.compare(offset, find.size(), find) == 0;
+}
+}
+
+namespace string_utils {
 
-tokenizer & tokenizer::max_outputs(size_t new_max_outputs) {
+Tokenizer::Tokenizer(std::string_view divider) : divider_(divider) {}
+
+Tokenizer & Tokenizer::max_outputs(size_t new_max_outputs) {
   max_outputs_ = new_max_outputs;
   return *this;
 }
 
-tokenizer & tokenizer::truncate(bool new_truncate) {
+Tokenizer & Tokenizer::truncate(bool new_truncate) {
   truncate_ = new_truncate;
   return *this;
 }
 
-tokenizer & tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
+Tokenizer & Tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
   ignore_empty_tokens_ = new_ignore_empty_tokens;
   return *this;
 }
 
-tokenizer & tokenizer::escapable(bool new_escapable) {
-  escapable_ = new_escapable;
+Tokenizer & Tokenizer::reverse_search(bool new_reverse) {
+  reverse_search_ = new_reverse;
   return *this;
 }
 
-tokenizer & tokenizer::reverse_search(bool new_reverse) {
-  if (reverse_search_ != new_reverse) {
-    reverse(divider_);
-    reverse(quote_.escaped);
-  }
-  reverse_search_ = new_reverse;
+EscapedTokenizer::EscapedTokenizer(std::string_view divider, Quote quote)
+    : EscapedTokenizer(Tokenizer(divider), quote) {}
+
+EscapedTokenizer::EscapedTokenizer(Tokenizer tok, Quote quote)
+    : Tokenizer(tok) {
+  this->quote(quote);
+}
+
+EscapedTokenizer & EscapedTokenizer::max_outputs(size_t new_max_outputs) {
+  Tokenizer::max_outputs(new_max_outputs);
   return *this;
 }
 
-static std::size_t countback(std::string const & str, std::size_t p, char c) {
-  if (p == 0 || str[p - 1] != c) return 0;
-  return p - str.find_last_not_of(c, p - 1) - 1;
+EscapedTokenizer & EscapedTokenizer::truncate(bool new_truncate) {
+  Tokenizer::truncate(new_truncate);
+  return *this;
 }
 
-std::vector<std::string> tokenizer::operator()(std::string input) const {
-  auto equals_from = [&input](std::string const & token, std::size_t from) {
-    return token.size() + from < input.size() &&
-           std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0;
-  };
+EscapedTokenizer &
+EscapedTokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
+  Tokenizer::ignore_empty_tokens(new_ignore_empty_tokens);
+  return *this;
+}
 
-  if (reverse_search_) { reverse(input); }
+EscapedTokenizer & EscapedTokenizer::reverse_search(bool new_reverse) {
+  Tokenizer::reverse_search(new_reverse);
+  return *this;
+}
 
-  std::vector<std::string> rval;
-  std::string buffer;
-  buffer.reserve(input.size());
+Tokenizer & Tokenizer::quote(Quote quote) {
+  quote_ = quote;
+  escapable_ = true;
+  return *this;
+}
+
+EscapedTokenizer Tokenizer::escapable(Quote quote) const {
+  return EscapedTokenizer(*this, quote);
+}
+
+std::vector<std::string_view>
+Tokenizer::operator()(std::string_view input) const {
+  // if (reverse_search_) { reverse(input); }
+
+  std::vector<std::string_view> rval;
   // If max_outputs_ == infinite_outputs, this will be infinite enough to work
   // since we'll hit overflow on the string itself before this.
   std::size_t const max = max_outputs_ - !truncate_;
   std::size_t const qsz = quote_.escaped.size();
-  std::size_t from = 0;
+  size_t span = 0;
   bool in_quote{false};
-  for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
-    // We check for escaped-quotes before we check for quotes to minimise
-    // complexity. Once in a quote, we simply append everything without checking
-    // for the divider until the end quote is encountered (escaped quotes are
-    // processed normally).
-    if (qsz > 0 && equals_from(quote_.escaped, pos)) {
-      buffer.append(1, quote_.on);
-      pos += qsz - 1;
-    } else if (input[pos] == quote_.on) {
-      in_quote = !in_quote;
-    } else if (in_quote || !equals_from(divider_, pos)) {
-      buffer.append(1, input[pos]);
-    } else if (escapable_ && countback(input, pos, '\\') % 2) {
-      buffer.back() = input[pos];
-    } else if (!in_quote) {
-      if (!ignore_empty_tokens_ || buffer.size()) { rval.emplace_back(buffer); }
-      from = pos + 1;
-      buffer.clear();
+  while (not input.empty() and rval.size() < max and span == 0) {
+    for (span = 0; span < input.size(); ++span) {
+      // We check for escaped-quotes before we check for quotes to minimise
+      // complexity. Once in a quote, we simply append everything without
+      // checking for the divider until the end quote is encountered (escaped
+      // quotes are processed normally).
+      if (not quote_.escaped.empty() and
+          current_token_is(input, span, quote_.escaped)) {
+        span += qsz - 1;
+      } else if (input[span] == quote_.on) {
+        in_quote = !in_quote;
+      } else if (in_quote or not current_token_is(input, span, divider_)) {
+        continue;
+      } else if (escapable_ and is_escaped(input, span)) {
+        continue;
+      } else {
+        if (not ignore_empty_tokens_ or span > 0) {
+          rval.push_back(input.substr(0, span));
+        }
+        input.remove_prefix(span + divider_.size());
+        span = 0;
+        break;
+      }
     }
   }
   // Due to the special handling rules of the truncate feature, we need
   // to add an additional layer of handling around empty tokens and buffer
-  if (ignore_empty_tokens_ && equals_from(divider_, from)) { ++from; }
+  if (ignore_empty_tokens_ and current_token_is(input, span, divider_)) {
+    input.remove_prefix(divider_.size());
+  }
   // Additionally, we do not want to include the final element if there is
   // actually no data remaining in the buffer/input string, even when we permit
   // empty tokens in our output.
-  if (rval.size() < max_outputs_ && !(buffer.empty() && from == input.size())) {
-    rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
+  if (rval.size() < max_outputs_ and not input.empty()) {
+    rval.emplace_back(input);
   }
 
-  if (reverse_search_) {
-    reverse(rval);
-    for (auto & str : rval) {
-      reverse(str);
+  // if (reverse_search_) {
+  //   reverse(rval);
+  //   for (auto & str : rval) {
+  //     reverse(str);
+  //   }
+  // }
+  return rval;
+}
+
+std::string Tokenizer::unescape(std::string_view token) const {
+  std::string rval;
+  rval.reserve(token.size());
+  for (size_t i = 0; i < token.size(); ++i) {
+    /*if (current_token_is(token, i, "\\\\")) {
+      rval.append(1, '\\');
+      ++i;
+    } else */
+    // The order of these tests is important!
+    // First we check if the current token is an escaped-quote - if so,
+    // replace it with the regular quote.
+    if (current_token_is(token, i, quote_.escaped)) {
+      rval.append(1, quote_.on);
+    } else if (token[i] == '\\' and current_token_is(token, i + 1, divider_)) {
+      // Then we check if we're looking at an escaped divider
+      rval.append(divider_);
+      i += divider_.size();
+    } else if (token[i] != quote_.on) {
+      // Lastly - we remove unescaped quotes
+      rval.append(1, token[i]);
     }
   }
   return rval;
 }
 
+std::vector<std::string> Tokenizer::operator()(std::string && input) const {
+  return promote(operator()(std::string_view(input)));
+}
+
+std::vector<std::string>
+EscapedTokenizer::operator()(std::string_view input) const {
+  auto rval = promote(Tokenizer::operator()(std::string_view(input)));
+  for (auto & token : rval) {
+    token = unescape(token);
+  }
+  return rval;
+}
+
+std::vector<std::string>
+EscapedTokenizer::operator()(std::string && input) const {
+  return operator()(std::string_view(input));
+}
+
 }
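
Usage sketch (reviewer's note, not part of the patch): the snippet below shows how the interface defined above might be called. It assumes the declarations in string_utils/tokenizer.h match these definitions and that Quote is an aggregate whose members are the quote character (on) and the escaped-quote sequence (escaped); that layout, and the sample inputs, are assumptions rather than anything taken from this diff.

#include "string_utils/tokenizer.h"

#include <iostream>
#include <string>
#include <string_view>

int main() {
  using namespace string_utils;

  // Plain tokenization: split on ", ", emit at most three tokens, drop empty
  // ones. operator()(std::string_view) returns string_views into the caller's
  // buffer, so `text` must outlive `tokens`.
  Tokenizer split(", ");
  split.max_outputs(3).ignore_empty_tokens(true);
  std::string_view text = "alpha, , beta, gamma, delta";
  auto tokens = split(text);
  for (std::string_view token : tokens) { std::cout << token << '\n'; }

  // Escape-aware tokenization: escapable() yields an EscapedTokenizer whose
  // operator() returns owning std::strings with quotes and escapes resolved.
  // Quote{'"', "\\\""} (quote char, escaped-quote sequence) is an assumed layout.
  auto quoted = split.escapable(Quote{'"', "\\\""});
  for (std::string const & token : quoted(std::string{"\"a, b\", c"})) {
    std::cout << token << '\n';
  }
  return 0;
}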