Jelajahi Sumber

Refactor tokenizer to use state machine/buffer.

Sam Jaffe 4 tahun lalu
induk
komit
263f9ed594
2 file diubah dengan 20 penambahan dan 24 penghapusan
  1. 1 2
      include/string_utils/tokenizer.h
  2. 19 22
      src/tokenizer.cxx

+ 1 - 2
include/string_utils/tokenizer.h

@@ -36,10 +36,9 @@ public:
   tokenizer &ignore_empty_tokens(bool new_ignore_empty_tokens);
   tokenizer &escapable(bool new_escapable);
 
-  std::vector<std::string> operator()(std::string input) const;
+  std::vector<std::string> operator()(std::string const &input) const;
   
 private:
-  std::size_t find(std::string &input, std::size_t from) const;
   size_t max_outputs() const;
 };
 

+ 19 - 22
src/tokenizer.cxx

@@ -38,36 +38,33 @@ static std::size_t countback(std::string const &str, std::size_t p, char c) {
   return p - str.find_last_not_of(c, p - 1) - 1;
 }
 
-std::size_t tokenizer::find(std::string &input, std::size_t from) const {
-  auto pos = input.find(divider_, from);
-  while (escapable_ && pos != std::string::npos &&
-         countback(input, pos, '\\') % 2) {
-    input.erase(pos - 1, 1);
-    pos = input.find(divider_, pos);
-  }
-  return pos;
-}
-
-std::vector<std::string> tokenizer::operator()(std::string input) const {
+std::vector<std::string> tokenizer::operator()(std::string const &input) const {
   std::vector<std::string> rval;
+  std::string buffer;
+  buffer.reserve(input.size());
   // If max_outputs_ == infinite_outputs, this will be infinite enough to work
   // since we'll hit overflow on the string itself before this.
   size_t const max = max_outputs_ - !truncate_;
-  size_t i = 0;
-  for (size_t n = find(input, i);
-       n != std::string::npos && rval.size() < max;
-       i = n + 1, n = find(input, i)) {
-    if (ignore_empty_tokens_ && i == n) {
-      continue;
+  std::size_t from = 0;
+  bool in_quote{false};
+  for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
+    if (input.find(divider_.c_str(), pos, divider_.size()) != pos || in_quote) {
+      buffer.append(1, input[pos]);
+    } else if (escapable_ && countback(input, pos, '\\') % 2) {
+      buffer.back() = input[pos];
+    } else {
+      if (!ignore_empty_tokens_ || buffer.size()) {
+        rval.emplace_back(buffer);
+      }
+      from = pos + 1;
+      buffer.clear();
     }
-    rval.emplace_back(input.substr(i, n - i));
   }
-  // Special Handling for the final token
-  if (ignore_empty_tokens_ && input.find(divider_, i) == i) {
-    ++i;
+  if (ignore_empty_tokens_ && input.find(divider_, from) == from) {
+    ++from;
   }
   if (rval.size() < max_outputs_) {
-    rval.emplace_back(input.substr(i));
+    rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
   }
   return rval;
 }