5 năm trước · 6908eff324
--- a/include/string_utils/tokenizer.h
+++ b/include/string_utils/tokenizer.h
@@ -16,25 +16,29 @@ namespace string_utils {
 
																 class tokenizer {
															
 
																 public:
															
 
																   static constexpr size_t const infinite_outputs{~size_t(0)};
															
 
																+  struct quote {
															
 
																+    char on;
															
 
																+    std::string escaped;
															
 
																+  };
															
 
																 private:
															
 
																   std::string divider_;
															
 
																+  quote quote_;
															
 
																   size_t max_outputs_{infinite_outputs};
															
 
																   bool truncate_{false};
															
 
																   bool ignore_empty_tokens_{true};
															
 
																   bool escapable_{false};
															
 
																 public:
															
 
																-  tokenizer(std::string const & divider);
															
 
																+  tokenizer(std::string divider, struct quote quote = {'\0', ""});
															
 
																   tokenizer &max_outputs(size_t new_max_outputs);
															
 
																   tokenizer &truncate(bool new_truncate_overage);
															
 
																   tokenizer &ignore_empty_tokens(bool new_ignore_empty_tokens);
															
 
																   tokenizer &escapable(bool new_escapable);
															
 
																-  std::vector<std::string> operator()(std::string input) const;
															
 
																+  std::vector<std::string> operator()(std::string const &input) const;
															
 
																 private:
															
 
																-  std::size_t find(std::string &input, std::size_t from) const;
															
 
																   size_t max_outputs() const;
															
 
																 };
															
--- a/src/tokenizer.cxx
+++ b/src/tokenizer.cxx
@@ -10,7 +10,8 @@
 
																 namespace string_utils {
															
 
																-tokenizer::tokenizer(std::string const & divider) : divider_(divider) {}
															
 
																+tokenizer::tokenizer(std::string divider, struct quote quote)
															
 
																+  : divider_(std::move(divider)), quote_(std::move(quote)) {}
															
 
																 tokenizer &tokenizer::max_outputs(size_t new_max_outputs) {
															
 
																   max_outputs_ = new_max_outputs;
															
@@ -37,36 +38,49 @@ static std::size_t countback(std::string const &str, std::size_t p, char c) {
 
																   return p - str.find_last_not_of(c, p - 1) - 1;
															
 
																 }
															
 
																-std::size_t tokenizer::find(std::string &input, std::size_t from) const {
															
 
																-  auto pos = input.find(divider_, from);
															
 
																-  while (escapable_ && pos != std::string::npos &&
															
 
																-         countback(input, pos, '\\') % 2) {
															
 
																-    input.erase(pos - 1, 1);
															
 
																-    pos = input.find(divider_, pos);
															
 
																-  }
															
 
																-  return pos;
															
 
																-}
															
 
																-
															
 
																-std::vector<std::string> tokenizer::operator()(std::string input) const {
															
 
																+std::vector<std::string> tokenizer::operator()(std::string const &input) const {
															
 
																+  auto equals_from = [&input](std::string const &token, std::size_t from) {
															
 
																+    return token.size() + from < input.size() &&
															
 
																+        std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0;
															
 
																+  };
															
 
																   std::vector<std::string> rval;
															
 
																+  std::string buffer;
															
 
																+  buffer.reserve(input.size());
															
 
																   // If max_outputs_ == infinite_outputs, this bound is effectively unlimited,
															
 
																   // since the input string itself would overflow before rval gets this large.
															
 
																-  size_t const max = max_outputs_ - !truncate_;
															
 
																-  size_t i = 0;
															
 
																-  for (size_t n = find(input, i);
															
 
																-       n != std::string::npos && rval.size() < max;
															
 
																-       i = n + 1, n = find(input, i)) {
															
 
																-    if (ignore_empty_tokens_ && i == n) {
															
 
																-      continue;
															
 
																+  std::size_t const max = max_outputs_ - !truncate_;
															
 
																+  std::size_t const qsz = quote_.escaped.size();
															
 
																+  std::size_t from = 0;
															
 
																+  bool in_quote{false};
															
 
																+  for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
															
 
																+    // We check for escaped-quotes before we check for quotes to minimise
															
 
																+    // complexity. Once in a quote, we simply append everything without checking
															
 
																+    // for the divider until the end quote is encountered (escaped quotes are
															
 
																+    // processed normally).
															
 
																+    if (qsz > 0 && equals_from(quote_.escaped, pos)) {
															
 
																+      buffer.append(1, quote_.on);
															
 
																+      pos += qsz - 1;
															
 
																+    } else if (input[pos] == quote_.on) {
															
 
																+      in_quote = !in_quote;
															
 
																+    } else if (in_quote || !equals_from(divider_, pos)) {
															
 
																+      buffer.append(1, input[pos]);
															
 
																+    } else if (escapable_ && countback(input, pos, '\\') % 2) {
															
 
																+      buffer.back() = input[pos];
															
 
																+    } else if (!in_quote) {
															
 
																+      if (!ignore_empty_tokens_ || buffer.size()) {
															
 
																+        rval.emplace_back(buffer);
															
 
																+      }
															
 
																+      from = pos + 1;
															
 
																+      buffer.clear();
															
 
																     }
															
 
																-    rval.emplace_back(input.substr(i, n - i));
															
 
																   }
															
 
																-  // Special Handling for the final token
															
 
																-  if (ignore_empty_tokens_ && input.find(divider_, i) == i) {
															
 
																-    ++i;
															
 
																+  // Due to the special handling rules of the truncate feature, we need an
															
 
																+  // extra layer of handling around empty tokens and the buffered final token.
															
 
																+  if (ignore_empty_tokens_ && equals_from(divider_, from)) {
															
 
																+    ++from;
															
 
																   }
															
 
																   if (rval.size() < max_outputs_) {
															
 
																-    rval.emplace_back(input.substr(i));
															
 
																+    rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
															
 
																   }
															
 
																   return rval;
															
 
																 }
															
--- a/test/tokenizer_test.cxx
+++ b/test/tokenizer_test.cxx
@@ -64,13 +64,43 @@ TEST(TokenizerTest, MaxSizeWithEmptyCanResultInTokenWithDividerPrefix) {
 
																 }
															
 
																 TEST(TokenizerTest, EscapableTokensStickTogether) {
															
 
																-  std::string const input = R"(A.B\.C)";
															
 
																-  std::vector<std::string> const expected{"A", "B.C"};
															
 
																-  EXPECT_THAT(tokenizer(".").escapable(true)(input), expected);
															
 
																+  std::string const input = R"(A B\ C)";
															
 
																+  std::vector<std::string> const expected{"A", "B C"};
															
 
																+  EXPECT_THAT(tokenizer(" ").escapable(true)(input), expected);
															
 
																 }
															
 
																 TEST(TokenizerTest, CorrectlySplitsWhenEvenEscapes) {
															
 
																-  std::string const input = R"(A.B\\.C)";
															
 
																+  std::string const input = R"(A B\\ C)";
															
 
																   std::vector<std::string> const expected{"A", R"(B\\)", "C"};
															
 
																-  EXPECT_THAT(tokenizer(".").escapable(true)(input), expected);
															
 
																+  EXPECT_THAT(tokenizer(" ").escapable(true)(input), expected);
															
 
																+}
															
 
																+
															
 
																+TEST(TokenizerTest, QuotesAreDiscarded) {
															
 
																+  std::string const input = R"(A,"B",C)";
															
 
																+  std::vector<std::string> const expected{"A", "B", "C"};
															
 
																+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
															
 
																+}
															
 
																+
															
 
																+TEST(TokenizerTest, QuotedTokensStickTogether) {
															
 
																+  std::string const input = R"(A,"B,C")";
															
 
																+  std::vector<std::string> const expected{"A", "B,C"};
															
 
																+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
															
 
																+}
															
 
																+
															
 
																+TEST(TokenizerTest, QuotedTokensAreEscapable) {
															
 
																+  std::string const input = R"(A,"B\",C")";
															
 
																+  std::vector<std::string> const expected{"A", "B\",C"};
															
 
																+  EXPECT_THAT(tokenizer(",", {'"', "\\\""})(input), expected);
															
 
																+}
															
 
																+
															
 
																+TEST(TokenizerTest, QuoteTokenLiteralIsApplicable) {
															
 
																+  std::string const input = R"(A,"B"",C")";
															
 
																+  std::vector<std::string> const expected{"A", "B\",C"};
															
 
																+  EXPECT_THAT(tokenizer(",", {'"', "\"\""})(input), expected);
															
 
																+}
															
 
																+
															
 
																+TEST(TokenizerTest, QuotesDontNeedToBeAtStartAndEnd) {
															
 
																+  std::string const input = R"(A,B",C")";
															
 
																+  std::vector<std::string> const expected{"A", "B,C"};
															
 
																+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
															
 
																 }