| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- //
- // tokenizer.cxx
- // string-utils
- //
- // Created by Sam Jaffe on 10/8/20.
- // Copyright © 2020 Sam Jaffe. All rights reserved.
- //
- #include "string_utils/tokenizer.h"
- namespace {
// Reports whether the character at position `p` of `str` is escaped, i.e.
// whether it is immediately preceded by an odd number of backslashes. An even
// run means the backslashes escape each other and leave `p` unescaped.
bool is_escaped(std::string_view str, std::size_t p) {
  std::size_t backslashes = 0;
  // Walk backwards from the character just before `p`, counting the
  // contiguous run of backslashes.
  while (backslashes < p and str[p - 1 - backslashes] == '\\') {
    ++backslashes;
  }
  return backslashes % 2 == 1;
}
// Converts a list of string views into a list of owning strings, copying the
// characters each view refers to. Order and element count are preserved.
auto promote(std::vector<std::string_view> input) {
  std::vector<std::string> owned;
  owned.reserve(input.size());
  for (std::string_view const piece : input) {
    owned.emplace_back(piece);
  }
  return owned;
}
// Reports whether `find` occurs in `token` starting exactly at `offset`.
//
// Returns false when `find` is empty, when `offset` lies beyond the end of
// `token`, or when fewer than find.size() characters remain at `offset`.
// This helper never throws: an out-of-range offset is treated as "no match".
bool current_token_is(std::string_view token, size_t offset,
                      std::string_view find) {
  if (find.empty() or offset > token.size()) { return false; }
  // substr clips to the end of `token`, so a truncated tail simply compares
  // unequal to `find`.
  return token.substr(offset, find.size()) == find;
}
- }
- namespace string_utils {
- Tokenizer::Tokenizer(std::string_view divider) : divider_(divider) {}
- Tokenizer & Tokenizer::max_outputs(size_t new_max_outputs) {
- max_outputs_ = new_max_outputs;
- return *this;
- }
- Tokenizer & Tokenizer::truncate(bool new_truncate) {
- truncate_ = new_truncate;
- return *this;
- }
- Tokenizer & Tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
- ignore_empty_tokens_ = new_ignore_empty_tokens;
- return *this;
- }
- Tokenizer & Tokenizer::reverse_search(bool new_reverse) {
- reverse_search_ = new_reverse;
- return *this;
- }
- EscapedTokenizer::EscapedTokenizer(std::string_view divider, Quote quote)
- : EscapedTokenizer(Tokenizer(divider), quote) {}
- EscapedTokenizer::EscapedTokenizer(Tokenizer tok, Quote quote)
- : Tokenizer(tok) {
- this->quote(quote);
- }
- EscapedTokenizer & EscapedTokenizer::max_outputs(size_t new_max_outputs) {
- Tokenizer::max_outputs(new_max_outputs);
- return *this;
- }
- EscapedTokenizer & EscapedTokenizer::truncate(bool new_truncate) {
- Tokenizer::truncate(new_truncate);
- return *this;
- }
- EscapedTokenizer &
- EscapedTokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
- Tokenizer::ignore_empty_tokens(new_ignore_empty_tokens);
- return *this;
- }
- EscapedTokenizer & EscapedTokenizer::reverse_search(bool new_reverse) {
- Tokenizer::reverse_search(new_reverse);
- return *this;
- }
- Tokenizer & Tokenizer::quote(Quote quote) {
- quote_ = quote;
- escapable_ = true;
- return *this;
- }
- EscapedTokenizer Tokenizer::escapable(Quote quote) const {
- return EscapedTokenizer(*this, quote);
- }
- std::vector<std::string_view>
- Tokenizer::operator()(std::string_view input) const {
- std::vector<std::string_view> rval;
- // If max_outputs_ == infinite_outputs, this will be infinite enough to work
- // since we'll hit overflow on the string itself before this.
- std::size_t const max = max_outputs_ - !truncate_;
- std::size_t const qsz = quote_.escaped.size();
- size_t span = 0;
- auto index = [this, &input, &span]() {
- return reverse_search_ ? input.size() - span - 1 : span;
- };
- bool in_quote{false};
- while (not input.empty() and rval.size() < max and span == 0) {
- for (span = 0; span < input.size(); ++span) {
- // We check for escaped-quotes before we check for quotes to minimise
- // complexity. Once in a quote, we simply append everything without
- // checking for the divider until the end quote is encountered (escaped
- // quotes are processed normally).
- if (not quote_.escaped.empty() and
- current_token_is(input, index(), quote_.escaped)) {
- span += qsz - 1;
- } else if (input[index()] == quote_.on) {
- in_quote = !in_quote;
- } else if (in_quote or not current_token_is(input, index(), divider_)) {
- continue;
- } else if (escapable_ and is_escaped(input, index())) {
- continue;
- } else if (reverse_search_) {
- if (not ignore_empty_tokens_ or span > 0) {
- rval.push_back(input.substr(index() + 1, span));
- }
- input.remove_suffix(span + divider_.size());
- span = 0;
- break;
- } else {
- if (not ignore_empty_tokens_ or span > 0) {
- rval.push_back(input.substr(0, span));
- }
- input.remove_prefix(span + divider_.size());
- span = 0;
- break;
- }
- }
- }
- // Due to the special handling rules of the truncate feature, we need
- // to add an additional layer of handling around empty tokens and buffer
- if (ignore_empty_tokens_ and current_token_is(input, span, divider_)) {
- input.remove_prefix(divider_.size());
- }
- // Additionally, we do not want to include the final element if there is
- // actually no data remaining in the buffer/input string, even when we permit
- // empty tokens in our output.
- if (rval.size() < max_outputs_ and not input.empty()) {
- rval.push_back(input);
- }
- if (reverse_search_) { std::reverse(rval.begin(), rval.end()); }
- return rval;
- }
- std::string Tokenizer::unescape(std::string_view token) const {
- std::string rval;
- rval.reserve(token.size());
- for (size_t i = 0; i < token.size(); ++i) {
- /*if (current_token_is(token, i, "\\\\")) {
- rval.append(1, '\\');
- ++i;
- } else */
- // The order of these tests is important!
- // First we check if the current token is an escaped-quote - if so,
- // replace it with the regular quote.
- if (current_token_is(token, i, quote_.escaped)) {
- rval.append(1, quote_.on);
- } else if (token[i] == '\\' and current_token_is(token, i + 1, divider_)) {
- // Then we check if we're looking at an escaped divider
- rval.append(divider_);
- i += divider_.size();
- } else if (token[i] != quote_.on) {
- // Lastly - we remote unescaped quotes
- rval.append(1, token[i]);
- }
- }
- return rval;
- }
- std::vector<std::string> Tokenizer::operator()(std::string && input) const {
- return promote(operator()(std::string_view(input)));
- }
- std::vector<std::string>
- EscapedTokenizer::operator()(std::string_view input) const {
- auto rval = promote(Tokenizer::operator()(std::string_view(input)));
- for (auto & token : rval) {
- token = unescape(token);
- }
- return rval;
- }
- std::vector<std::string>
- EscapedTokenizer::operator()(std::string && input) const {
- return operator()(std::string_view(input));
- }
- }
|