Sam Jaffe 4 года назад
Родитель
Коммит
ad57d0fb26
2 изменённых файла: 18 добавлений и 2 удаления
  1. 12 2
      src/tokenizer.cxx
  2. 6 0
      test/tokenizer_test.cxx

+ 12 - 2
src/tokenizer.cxx

@@ -44,11 +44,19 @@ std::vector<std::string> tokenizer::operator()(std::string const &input) const {
   buffer.reserve(input.size());
   // If max_outputs_ == infinite_outputs, this will be infinite enough to work
   // since we'll hit overflow on the string itself before this.
-  size_t const max = max_outputs_ - !truncate_;
+  std::size_t const max = max_outputs_ - !truncate_;
+  std::size_t const qsz = quote_.escaped.size();
   std::size_t from = 0;
   bool in_quote{false};
   for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
-    if (input[pos] == quote_.on) {
+    // We check for escaped-quotes before we check for quotes to minimise
+    // complexity. Once in a quote, we simply append everything without checking
+    // for the divider until the end quote is encountered (escaped quotes are
+    // processed normally).
+    if (qsz > 0 && input.find(quote_.escaped.c_str(), pos, qsz) == pos) {
+      buffer.append(1, quote_.on);
+      pos += qsz - 1;
+    } else if (input[pos] == quote_.on) {
       in_quote = !in_quote;
     } else if (input.find(divider_.c_str(), pos, divider_.size()) != pos ||
                in_quote) {
@@ -63,6 +71,8 @@ std::vector<std::string> tokenizer::operator()(std::string const &input) const {
       buffer.clear();
     }
   }
+  // Due to the special handling rules of the truncate feature, we need
+  // to add an additional layer of handling around empty tokens and the buffer.
   if (ignore_empty_tokens_ && input.find(divider_, from) == from) {
     ++from;
   }

+ 6 - 0
test/tokenizer_test.cxx

@@ -93,6 +93,12 @@ TEST(TokenizerTest, QuotedTokensAreEscapable) {
   EXPECT_THAT(tokenizer(",", {'"', "\\\""})(input), expected);
 }
 
+TEST(TokenizerTest, QuoteTokenLiteralIsApplicable) {  // A doubled quote, configured as the escape sequence, must produce one literal quote.
+  std::string const input = R"(A,"B"",C")";  // Raw text is: A,"B"",C" - the second field is quoted and contains a doubled quote.
+  std::vector<std::string> const expected{"A", "B\",C"};  // The doubled quote collapses to one quote; the comma inside the quotes is kept, not split on.
+  EXPECT_THAT(tokenizer(",", {'"', "\"\""})(input), expected);  // Divider is a comma; the quote character is a double quote and its escaped form is two double quotes.
+}
+
 TEST(TokenizerTest, QuotesDontNeedToBeAtStartAndEnd) {
   std::string const input = R"(A,B",C")";
   std::vector<std::string> const expected{"A", "B,C"};