Explorar o código

Merge branch 'feat/tokenize/quoted'

* feat/tokenize/quoted:
  Find doesn't actually work that way, make it efficient.
  Pass tests.
  Pass non-escaped quote tests.
  Refactor tokenizer to use state machine/buffer.
  Adjust tests, better quote object
  Add tests for quoted strings (using csv style as is commonly used), change divider for escape tests to something better aligned to the use-case (spaces).
  Add quote string option to tokenizer.
Sam Jaffe hai 4 anos
pai
achega
6908eff324
Modificáronse 3 ficheiros con 80 adicións e 32 borrados
  1. 7 3
      include/string_utils/tokenizer.h
  2. 38 24
      src/tokenizer.cxx
  3. 35 5
      test/tokenizer_test.cxx

+ 7 - 3
include/string_utils/tokenizer.h

@@ -16,25 +16,29 @@ namespace string_utils {
 class tokenizer {
 public:
   static constexpr size_t const infinite_outputs{~size_t(0)};
+  struct quote {
+    char on;
+    std::string escaped;
+  };
 private:
   std::string divider_;
+  quote quote_;
   size_t max_outputs_{infinite_outputs};
   bool truncate_{false};
   bool ignore_empty_tokens_{true};
   bool escapable_{false};
 
 public:
-  tokenizer(std::string const & divider);
+  tokenizer(std::string divider, struct quote quote = {'\0', ""});
     
   tokenizer &max_outputs(size_t new_max_outputs);
   tokenizer &truncate(bool new_truncate_overage);
   tokenizer &ignore_empty_tokens(bool new_ignore_empty_tokens);
   tokenizer &escapable(bool new_escapable);
 
-  std::vector<std::string> operator()(std::string input) const;
+  std::vector<std::string> operator()(std::string const &input) const;
   
 private:
-  std::size_t find(std::string &input, std::size_t from) const;
   size_t max_outputs() const;
 };
 

+ 38 - 24
src/tokenizer.cxx

@@ -10,7 +10,8 @@
 
 namespace string_utils {
 
-tokenizer::tokenizer(std::string const & divider) : divider_(divider) {}
+tokenizer::tokenizer(std::string divider, struct quote quote)
+  : divider_(std::move(divider)), quote_(std::move(quote)) {}
 
 tokenizer &tokenizer::max_outputs(size_t new_max_outputs) {
   max_outputs_ = new_max_outputs;
@@ -37,36 +38,49 @@ static std::size_t countback(std::string const &str, std::size_t p, char c) {
   return p - str.find_last_not_of(c, p - 1) - 1;
 }
 
-std::size_t tokenizer::find(std::string &input, std::size_t from) const {
-  auto pos = input.find(divider_, from);
-  while (escapable_ && pos != std::string::npos &&
-         countback(input, pos, '\\') % 2) {
-    input.erase(pos - 1, 1);
-    pos = input.find(divider_, pos);
-  }
-  return pos;
-}
-
-std::vector<std::string> tokenizer::operator()(std::string input) const {
+std::vector<std::string> tokenizer::operator()(std::string const &input) const {
+  auto equals_from = [&input](std::string const &token, std::size_t from) {
+    return token.size() + from < input.size() &&
+        std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0;
+  };
   std::vector<std::string> rval;
+  std::string buffer;
+  buffer.reserve(input.size());
   // If max_outputs_ == infinite_outputs, this will be infinite enough to work
   // since we'll hit overflow on the string itself before this.
-  size_t const max = max_outputs_ - !truncate_;
-  size_t i = 0;
-  for (size_t n = find(input, i);
-       n != std::string::npos && rval.size() < max;
-       i = n + 1, n = find(input, i)) {
-    if (ignore_empty_tokens_ && i == n) {
-      continue;
+  std::size_t const max = max_outputs_ - !truncate_;
+  std::size_t const qsz = quote_.escaped.size();
+  std::size_t from = 0;
+  bool in_quote{false};
+  for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
+    // We check for escaped-quotes before we check for quotes to minimise
+    // complexity. Once in a quote, we simply append everything without checking
+    // for the divider until the end quote is encountered (escaped quotes are
+    // processed normally).
+    if (qsz > 0 && equals_from(quote_.escaped, pos)) {
+      buffer.append(1, quote_.on);
+      pos += qsz - 1;
+    } else if (input[pos] == quote_.on) {
+      in_quote = !in_quote;
+    } else if (in_quote || !equals_from(divider_, pos)) {
+      buffer.append(1, input[pos]);
+    } else if (escapable_ && countback(input, pos, '\\') % 2) {
+      buffer.back() = input[pos];
+    } else if (!in_quote) {
+      if (!ignore_empty_tokens_ || buffer.size()) {
+        rval.emplace_back(buffer);
+      }
+      from = pos + 1;
+      buffer.clear();
     }
-    rval.emplace_back(input.substr(i, n - i));
   }
-  // Special Handling for the final token
-  if (ignore_empty_tokens_ && input.find(divider_, i) == i) {
-    ++i;
+  // Due to the special handling rules of the truncate feature, we need
+  // to add an additional layer of handling around empty tokens and buffer
+  if (ignore_empty_tokens_ && equals_from(divider_, from)) {
+    ++from;
   }
   if (rval.size() < max_outputs_) {
-    rval.emplace_back(input.substr(i));
+    rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
   }
   return rval;
 }

+ 35 - 5
test/tokenizer_test.cxx

@@ -64,13 +64,43 @@ TEST(TokenizerTest, MaxSizeWithEmptyCanResultInTokenWithDividerPrefix) {
 }
 
 TEST(TokenizerTest, EscapableTokensStickTogether) {
-  std::string const input = R"(A.B\.C)";
-  std::vector<std::string> const expected{"A", "B.C"};
-  EXPECT_THAT(tokenizer(".").escapable(true)(input), expected);
+  std::string const input = R"(A B\ C)";
+  std::vector<std::string> const expected{"A", "B C"};
+  EXPECT_THAT(tokenizer(" ").escapable(true)(input), expected);
 }
 
 TEST(TokenizerTest, CorrectlySplitsWhenEvenEscapes) {
-  std::string const input = R"(A.B\\.C)";
+  std::string const input = R"(A B\\ C)";
   std::vector<std::string> const expected{"A", R"(B\\)", "C"};
-  EXPECT_THAT(tokenizer(".").escapable(true)(input), expected);
+  EXPECT_THAT(tokenizer(" ").escapable(true)(input), expected);
+}
+
+TEST(TokenizerTest, QuotesAreDiscarded) {
+  std::string const input = R"(A,"B",C)";
+  std::vector<std::string> const expected{"A", "B", "C"};
+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
+}
+
+TEST(TokenizerTest, QuotedTokensStickTogether) {
+  std::string const input = R"(A,"B,C")";
+  std::vector<std::string> const expected{"A", "B,C"};
+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
+}
+
+TEST(TokenizerTest, QuotedTokensAreEscapable) {
+  std::string const input = R"(A,"B\",C")";
+  std::vector<std::string> const expected{"A", "B\",C"};
+  EXPECT_THAT(tokenizer(",", {'"', "\\\""})(input), expected);
+}
+
+TEST(TokenizerTest, QuoteTokenLiteralIsApplicable) {
+  std::string const input = R"(A,"B"",C")";
+  std::vector<std::string> const expected{"A", "B\",C"};
+  EXPECT_THAT(tokenizer(",", {'"', "\"\""})(input), expected);
+}
+
+TEST(TokenizerTest, QuotesDontNeedToBeAtStartAndEnd) {
+  std::string const input = R"(A,B",C")";
+  std::vector<std::string> const expected{"A", "B,C"};
+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
 }