Explorar o código

Merge branch 'feat/tokenize/quoted'

* feat/tokenize/quoted:
  Find doesn't actually work that way, make it efficient.
  Pass tests.
  Pass non-escaped quote tests.
  Refactor tokenizer to use state machine/buffer.
  Adjust tests, better quote object
  Add tests for quoted strings (using csv style as is commonly used), change divider for escape tests to something better aligned to the use-case (spaces).
  Add quote string option to tokenizer.
Sam Jaffe hai 4 anos
pai
achega
6908eff324
Modificáronse 3 ficheiros con 80 adicións e 32 borrados
  1. 7 3
      include/string_utils/tokenizer.h
  2. 38 24
      src/tokenizer.cxx
  3. 35 5
      test/tokenizer_test.cxx

+ 7 - 3
include/string_utils/tokenizer.h

@@ -16,25 +16,29 @@ namespace string_utils {
 class tokenizer {
 public:
   static constexpr size_t const infinite_outputs{~size_t(0)};
+  struct quote {
+    char on;
+    std::string escaped;
+  };
 private:
   std::string divider_;
+  quote quote_;
   size_t max_outputs_{infinite_outputs};
   bool truncate_{false};
   bool ignore_empty_tokens_{true};
   bool escapable_{false};
 
 public:
-  tokenizer(std::string const & divider);
+  tokenizer(std::string divider, struct quote quote = {'\0', ""});
     
   tokenizer &max_outputs(size_t new_max_outputs);
   tokenizer &truncate(bool new_truncate_overage);
   tokenizer &ignore_empty_tokens(bool new_ignore_empty_tokens);
   tokenizer &escapable(bool new_escapable);
 
-  std::vector<std::string> operator()(std::string input) const;
+  std::vector<std::string> operator()(std::string const &input) const;
   
 private:
-  std::size_t find(std::string &input, std::size_t from) const;
   size_t max_outputs() const;
 };
 

+ 38 - 24
src/tokenizer.cxx

@@ -10,7 +10,8 @@
 
 namespace string_utils {
 
-tokenizer::tokenizer(std::string const & divider) : divider_(divider) {}
+tokenizer::tokenizer(std::string divider, struct quote quote)
+  : divider_(std::move(divider)), quote_(std::move(quote)) {}
 
 tokenizer &tokenizer::max_outputs(size_t new_max_outputs) {
   max_outputs_ = new_max_outputs;
@@ -37,36 +38,49 @@ static std::size_t countback(std::string const &str, std::size_t p, char c) {
   return p - str.find_last_not_of(c, p - 1) - 1;
 }
 
-std::size_t tokenizer::find(std::string &input, std::size_t from) const {
-  auto pos = input.find(divider_, from);
-  while (escapable_ && pos != std::string::npos &&
-         countback(input, pos, '\\') % 2) {
-    input.erase(pos - 1, 1);
-    pos = input.find(divider_, pos);
-  }
-  return pos;
-}
-
-std::vector<std::string> tokenizer::operator()(std::string input) const {
+std::vector<std::string> tokenizer::operator()(std::string const &input) const {
+  auto equals_from = [&input](std::string const &token, std::size_t from) {
+    return token.size() + from < input.size() &&
+        std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0;
+  };
   std::vector<std::string> rval;
+  std::string buffer;
+  buffer.reserve(input.size());
   // If max_outputs_ == infinite_outputs, this will be infinite enough to work
   // since we'll hit overflow on the string itself before this.
-  size_t const max = max_outputs_ - !truncate_;
-  size_t i = 0;
-  for (size_t n = find(input, i);
-       n != std::string::npos && rval.size() < max;
-       i = n + 1, n = find(input, i)) {
-    if (ignore_empty_tokens_ && i == n) {
-      continue;
+  std::size_t const max = max_outputs_ - !truncate_;
+  std::size_t const qsz = quote_.escaped.size();
+  std::size_t from = 0;
+  bool in_quote{false};
+  for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
+    // We check for escaped-quotes before we check for quotes to minimise
+    // complexity. Once in a quote, we simply append everything without checking
+    // for the divider until the end quote is encountered (escaped quotes are
+    // processed normally).
+    if (qsz > 0 && equals_from(quote_.escaped, pos)) {
+      buffer.append(1, quote_.on);
+      pos += qsz - 1;
+    } else if (input[pos] == quote_.on) {
+      in_quote = !in_quote;
+    } else if (in_quote || !equals_from(divider_, pos)) {
+      buffer.append(1, input[pos]);
+    } else if (escapable_ && countback(input, pos, '\\') % 2) {
+      buffer.back() = input[pos];
+    } else if (!in_quote) {
+      if (!ignore_empty_tokens_ || buffer.size()) {
+        rval.emplace_back(buffer);
+      }
+      from = pos + 1;
+      buffer.clear();
     }
-    rval.emplace_back(input.substr(i, n - i));
   }
-  // Special Handling for the final token
-  if (ignore_empty_tokens_ && input.find(divider_, i) == i) {
-    ++i;
+  // Due to the special handling rules of the truncate feature, we need
+  // to add an additional layer of handling around empty tokens and buffer
+  if (ignore_empty_tokens_ && equals_from(divider_, from)) {
+    ++from;
   }
   if (rval.size() < max_outputs_) {
-    rval.emplace_back(input.substr(i));
+    rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
   }
   return rval;
 }

+ 35 - 5
test/tokenizer_test.cxx

@@ -64,13 +64,43 @@ TEST(TokenizerTest, MaxSizeWithEmptyCanResultInTokenWithDividerPrefix) {
 }
 
 TEST(TokenizerTest, EscapableTokensStickTogether) {
-  std::string const input = R"(A.B\.C)";
-  std::vector<std::string> const expected{"A", "B.C"};
-  EXPECT_THAT(tokenizer(".").escapable(true)(input), expected);
+  std::string const input = R"(A B\ C)";
+  std::vector<std::string> const expected{"A", "B C"};
+  EXPECT_THAT(tokenizer(" ").escapable(true)(input), expected);
 }
 
 TEST(TokenizerTest, CorrectlySplitsWhenEvenEscapes) {
-  std::string const input = R"(A.B\\.C)";
+  std::string const input = R"(A B\\ C)";
   std::vector<std::string> const expected{"A", R"(B\\)", "C"};
-  EXPECT_THAT(tokenizer(".").escapable(true)(input), expected);
+  EXPECT_THAT(tokenizer(" ").escapable(true)(input), expected);
+}
+
+TEST(TokenizerTest, QuotesAreDiscarded) {
+  std::string const input = R"(A,"B",C)";
+  std::vector<std::string> const expected{"A", "B", "C"};
+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
+}
+
+TEST(TokenizerTest, QuotedTokensStickTogether) {
+  std::string const input = R"(A,"B,C")";
+  std::vector<std::string> const expected{"A", "B,C"};
+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
+}
+
+TEST(TokenizerTest, QuotedTokensAreEscapable) {
+  std::string const input = R"(A,"B\",C")";
+  std::vector<std::string> const expected{"A", "B\",C"};
+  EXPECT_THAT(tokenizer(",", {'"', "\\\""})(input), expected);
+}
+
+TEST(TokenizerTest, QuoteTokenLiteralIsApplicable) {
+  std::string const input = R"(A,"B"",C")";
+  std::vector<std::string> const expected{"A", "B\",C"};
+  EXPECT_THAT(tokenizer(",", {'"', "\"\""})(input), expected);
+}
+
+TEST(TokenizerTest, QuotesDontNeedToBeAtStartAndEnd) {
+  std::string const input = R"(A,B",C")";
+  std::vector<std::string> const expected{"A", "B,C"};
+  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
 }