Переглянути джерело

Make tokenizer support string_view.

Sam Jaffe 2 роки тому
батько
коміт
6c5dfe4bee

+ 19 - 0
README.md

@@ -0,0 +1,19 @@
+# String Utilities in C++
+
+A couple of utilities for improving string usability
+
+## Join
+
+Concatenate the elements of a container with a joining token. Uses ostreams.
+
+## Tokenizer/Split
+
+Split a string into a vector of strings. There are two different versions of the tokenizer: normal 
+and escapable. The EscapableTokenizer cannot return string\_views, because it may have to doctor 
+the contents.
+
+## Cast - Coercing types from strings
+
+In GoogleMock, if you don't want to define an ostream operator for your type, you can define a 
+function `PrintTo(T const &, std::ostream*)` in the same namespace as `T`. GoogleMock then uses ADL 
+to find that function and use it to print out the formatted version.

+ 12 - 4
include/string_utils/forwards.h

@@ -11,14 +11,22 @@
 #include "string_utils/traits.h"
 
 namespace string_utils {
+
+class Tokenizer;
+class EscapedTokenizer;
+
 // A helper object for providing partial specializations for casting
 template <typename, typename = void> struct cast_helper;
 
 // The main parser
 template <typename T> std::pair<T, bool> cast(std::string_view str) noexcept;
-template <typename T, typename S> std::pair<T, bool> cast(std::vector<S> const &str) noexcept;
+template <typename T, typename S>
+std::pair<T, bool> cast(std::vector<S> const & str) noexcept;
 
-template <typename S, typename T,
-          typename = std::enable_if_t<detail::has_result<cast_helper<T>(S, T&)>{}>>
-bool cast(S const &str, T & to) noexcept { return cast_helper<T>{}(str, to); }
+template <
+    typename S, typename T,
+    typename = std::enable_if_t<detail::has_result<cast_helper<T>(S, T &)>{}>>
+bool cast(S const & str, T & to) noexcept {
+  return cast_helper<T>{}(str, to);
+}
 }

+ 41 - 20
include/string_utils/tokenizer.h

@@ -11,48 +11,69 @@
 #include <string>
 #include <vector>
 
+#include "string_utils/forwards.h"
+
 namespace string_utils {
 
-class tokenizer {
+class Tokenizer {
 public:
-  static constexpr size_t const infinite_outputs{~size_t(0)};
-  struct quote {
+  static constexpr size_t const UNLIMITED = ~0ul;
+
+protected:
+  struct Quote {
     char on;
-    std::string escaped;
+    std::string_view escaped;
   };
 
 private:
-  std::string divider_;
-  quote quote_;
-  size_t max_outputs_{infinite_outputs};
+  std::string_view divider_;
+  Quote quote_{'\0', ""};
+  size_t max_outputs_{UNLIMITED};
   bool truncate_{false};
   bool ignore_empty_tokens_{true};
   bool escapable_{false};
   bool reverse_search_{false};
 
 public:
-  tokenizer(std::string divider, struct quote quote = {'\0', ""});
+  explicit Tokenizer(std::string_view divider);
 
-  tokenizer & max_outputs(size_t new_max_outputs);
-  tokenizer & truncate(bool new_truncate_overage);
-  tokenizer & ignore_empty_tokens(bool new_ignore_empty_tokens);
-  tokenizer & escapable(bool new_escapable);
-  tokenizer & reverse_search(bool new_reverse);
+  Tokenizer & max_outputs(size_t new_max_outputs);
+  Tokenizer & truncate(bool new_truncate_overage);
+  Tokenizer & ignore_empty_tokens(bool new_ignore_empty_tokens);
+  Tokenizer & reverse_search(bool new_reverse);
+  EscapedTokenizer escapable(Quote quote = Quote{'\0', ""}) const;
 
-  std::vector<std::string> operator()(std::string input) const;
+  std::vector<std::string> operator()(std::string && input) const;
+  std::vector<std::string_view> operator()(std::string_view input) const;
 
-private:
-  size_t max_outputs() const;
+protected:
+  Tokenizer & quote(Quote quote);
+  std::string unescape(std::string_view token) const;
+};
+
+class EscapedTokenizer : public Tokenizer {
+public:
+  explicit EscapedTokenizer(std::string_view divider,
+                            Quote quote = Quote{'\0', ""});
+  explicit EscapedTokenizer(Tokenizer impl, Quote quote);
+
+  EscapedTokenizer & max_outputs(size_t new_max_outputs);
+  EscapedTokenizer & truncate(bool new_truncate_overage);
+  EscapedTokenizer & ignore_empty_tokens(bool new_ignore_empty_tokens);
+  EscapedTokenizer & reverse_search(bool new_reverse);
+
+  std::vector<std::string> operator()(std::string && input) const;
+  std::vector<std::string> operator()(std::string_view input) const;
 };
 
 inline auto split(std::string const & data, std::string const & on,
-                  size_t max = tokenizer::infinite_outputs) {
-  return tokenizer{on}.max_outputs(max)(data);
+                  size_t max = Tokenizer::UNLIMITED) {
+  return Tokenizer{on}.max_outputs(max)(data);
 }
 
 inline auto rsplit(std::string const & data, std::string const & on,
-                   size_t max = tokenizer::infinite_outputs) {
-  return tokenizer{on}.reverse_search(true).max_outputs(max)(data);
+                   size_t max = Tokenizer::UNLIMITED) {
+  return Tokenizer{on}.reverse_search(true).max_outputs(max)(data);
 }
 
 }

+ 134 - 54
src/tokenizer.cxx

@@ -8,103 +8,183 @@
 
 #include "string_utils/tokenizer.h"
 
-namespace string_utils {
+namespace {
+bool is_escaped(std::string_view str, std::size_t p) {
+  if (p == 0 || str[p - 1] != '\\') { return false; }
+  return ((p - str.find_last_not_of('\\', p - 1) - 1) & 1) == 1;
+}
 
-template <typename C> static void reverse(C & str) {
-  std::reverse(str.begin(), str.end());
+auto promote(std::vector<std::string_view> input) {
+  return std::vector<std::string>(input.cbegin(), input.cend());
 }
 
-tokenizer::tokenizer(std::string divider, struct quote quote)
-    : divider_(std::move(divider)), quote_(std::move(quote)) {}
+bool current_token_is(std::string_view token, size_t offset,
+                      std::string_view find) {
+  return find.size() > 0 and token.compare(offset, find.size(), find) == 0;
+};
+}
+
+namespace string_utils {
 
-tokenizer & tokenizer::max_outputs(size_t new_max_outputs) {
+Tokenizer::Tokenizer(std::string_view divider) : divider_(divider) {}
+
+Tokenizer & Tokenizer::max_outputs(size_t new_max_outputs) {
   max_outputs_ = new_max_outputs;
   return *this;
 }
 
-tokenizer & tokenizer::truncate(bool new_truncate) {
+Tokenizer & Tokenizer::truncate(bool new_truncate) {
   truncate_ = new_truncate;
   return *this;
 }
 
-tokenizer & tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
+Tokenizer & Tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
   ignore_empty_tokens_ = new_ignore_empty_tokens;
   return *this;
 }
 
-tokenizer & tokenizer::escapable(bool new_escapable) {
-  escapable_ = new_escapable;
+Tokenizer & Tokenizer::reverse_search(bool new_reverse) {
+  reverse_search_ = new_reverse;
   return *this;
 }
 
-tokenizer & tokenizer::reverse_search(bool new_reverse) {
-  if (reverse_search_ != new_reverse) {
-    reverse(divider_);
-    reverse(quote_.escaped);
-  }
-  reverse_search_ = new_reverse;
+EscapedTokenizer::EscapedTokenizer(std::string_view divider, Quote quote)
+    : EscapedTokenizer(Tokenizer(divider), quote) {}
+
+EscapedTokenizer::EscapedTokenizer(Tokenizer tok, Quote quote)
+    : Tokenizer(tok) {
+  this->quote(quote);
+}
+
+EscapedTokenizer & EscapedTokenizer::max_outputs(size_t new_max_outputs) {
+  Tokenizer::max_outputs(new_max_outputs);
   return *this;
 }
 
-static std::size_t countback(std::string const & str, std::size_t p, char c) {
-  if (p == 0 || str[p - 1] != c) return 0;
-  return p - str.find_last_not_of(c, p - 1) - 1;
+EscapedTokenizer & EscapedTokenizer::truncate(bool new_truncate) {
+  Tokenizer::truncate(new_truncate);
+  return *this;
 }
 
-std::vector<std::string> tokenizer::operator()(std::string input) const {
-  auto equals_from = [&input](std::string const & token, std::size_t from) {
-    return token.size() + from < input.size() &&
-           std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0;
-  };
+EscapedTokenizer &
+EscapedTokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
+  Tokenizer::ignore_empty_tokens(new_ignore_empty_tokens);
+  return *this;
+}
 
-  if (reverse_search_) { reverse(input); }
+EscapedTokenizer & EscapedTokenizer::reverse_search(bool new_reverse) {
+  Tokenizer::reverse_search(new_reverse);
+  return *this;
+}
 
-  std::vector<std::string> rval;
-  std::string buffer;
-  buffer.reserve(input.size());
+Tokenizer & Tokenizer::quote(Quote quote) {
+  quote_ = quote;
+  escapable_ = true;
+  return *this;
+}
+
+EscapedTokenizer Tokenizer::escapable(Quote quote) const {
+  return EscapedTokenizer(*this, quote);
+}
+
+std::vector<std::string_view>
+Tokenizer::operator()(std::string_view input) const {
+  //  if (reverse_search_) { reverse(input); }
+
+  std::vector<std::string_view> rval;
   // If max_outputs_ == infinite_outputs, this will be infinite enough to work
   // since we'll hit overflow on the string itself before this.
   std::size_t const max = max_outputs_ - !truncate_;
   std::size_t const qsz = quote_.escaped.size();
-  std::size_t from = 0;
+  size_t span = 0;
   bool in_quote{false};
-  for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
-    // We check for escaped-quotes before we check for quotes to minimise
-    // complexity. Once in a quote, we simply append everything without checking
-    // for the divider until the end quote is encountered (escaped quotes are
-    // processed normally).
-    if (qsz > 0 && equals_from(quote_.escaped, pos)) {
-      buffer.append(1, quote_.on);
-      pos += qsz - 1;
-    } else if (input[pos] == quote_.on) {
-      in_quote = !in_quote;
-    } else if (in_quote || !equals_from(divider_, pos)) {
-      buffer.append(1, input[pos]);
-    } else if (escapable_ && countback(input, pos, '\\') % 2) {
-      buffer.back() = input[pos];
-    } else if (!in_quote) {
-      if (!ignore_empty_tokens_ || buffer.size()) { rval.emplace_back(buffer); }
-      from = pos + 1;
-      buffer.clear();
+  while (not input.empty() and rval.size() < max and span == 0) {
+    for (span = 0; span < input.size(); ++span) {
+      // We check for escaped-quotes before we check for quotes to minimise
+      // complexity. Once in a quote, we simply append everything without
+      // checking for the divider until the end quote is encountered (escaped
+      // quotes are processed normally).
+      if (not quote_.escaped.empty() and
+          current_token_is(input, span, quote_.escaped)) {
+        span += qsz - 1;
+      } else if (input[span] == quote_.on) {
+        in_quote = !in_quote;
+      } else if (in_quote or not current_token_is(input, span, divider_)) {
+        continue;
+      } else if (escapable_ and is_escaped(input, span)) {
+        continue;
+      } else {
+        if (not ignore_empty_tokens_ or span > 0) {
+          rval.push_back(input.substr(0, span));
+        }
+        input.remove_prefix(span + divider_.size());
+        span = 0;
+        break;
+      }
     }
   }
   // Due to the special handling rules of the truncate feature, we need
   // to add an additional layer of handling around empty tokens and buffer
-  if (ignore_empty_tokens_ && equals_from(divider_, from)) { ++from; }
+  if (ignore_empty_tokens_ and current_token_is(input, span, divider_)) {
+    input.remove_prefix(divider_.size());
+  }
   // Additionally, we do not want to include the final element if there is
   // actually no data remaining in the buffer/input string, even when we permit
   // empty tokens in our output.
-  if (rval.size() < max_outputs_ && !(buffer.empty() && from == input.size())) {
-    rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
+  if (rval.size() < max_outputs_ and not input.empty()) {
+    rval.emplace_back(input);
   }
 
-  if (reverse_search_) {
-    reverse(rval);
-    for (auto & str : rval) {
-      reverse(str);
+  //  if (reverse_search_) {
+  //    reverse(rval);
+  //    for (auto & str : rval) {
+  //      reverse(str);
+  //    }
+  //  }
+  return rval;
+}
+
+std::string Tokenizer::unescape(std::string_view token) const {
+  std::string rval;
+  rval.reserve(token.size());
+  for (size_t i = 0; i < token.size(); ++i) {
+    /*if (current_token_is(token, i, "\\\\")) {
+      rval.append(1, '\\');
+      ++i;
+    } else */
+    // The order of these tests is important!
+    // First we check if the current token is an escaped-quote - if so,
+    // replace it with the regular quote.
+    if (current_token_is(token, i, quote_.escaped)) {
+      rval.append(1, quote_.on);
+    } else if (token[i] == '\\' and current_token_is(token, i + 1, divider_)) {
+      // Then we check if we're looking at an escaped divider
+      rval.append(divider_);
+      i += divider_.size();
+    } else if (token[i] != quote_.on) {
+      // Lastly - we remove unescaped quotes
+      rval.append(1, token[i]);
     }
   }
   return rval;
 }
 
+std::vector<std::string> Tokenizer::operator()(std::string && input) const {
+  return promote(operator()(std::string_view(input)));
+}
+
+std::vector<std::string>
+EscapedTokenizer::operator()(std::string_view input) const {
+  auto rval = promote(Tokenizer::operator()(std::string_view(input)));
+  for (auto & token : rval) {
+    token = unescape(token);
+  }
+  return rval;
+}
+
+std::vector<std::string>
+EscapedTokenizer::operator()(std::string && input) const {
+  return operator()(std::string_view(input));
+}
+
 }

+ 2 - 0
string-utils.xcodeproj/project.pbxproj

@@ -70,6 +70,7 @@
 		CD266880252FFA7E00B3E667 /* tokenizer_test.cxx */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = tokenizer_test.cxx; sourceTree = "<group>"; };
 		CD266886252FFAAE00B3E667 /* string_utils-test.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = "string_utils-test.xctest"; sourceTree = BUILT_PRODUCTS_DIR; };
 		CD26688A252FFAAE00B3E667 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		CD87CD7E29BCC63600C5949D /* README.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = "<group>"; };
 		CDC883E228560A7C0088C91E /* any_of.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = any_of.h; sourceTree = "<group>"; };
 		CDC883E328560A7C0088C91E /* tokenizer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = "<group>"; };
 		CDC883E428560A7C0088C91E /* cast.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cast.h; sourceTree = "<group>"; };
@@ -98,6 +99,7 @@
 		CD266859252FF4B600B3E667 = {
 			isa = PBXGroup;
 			children = (
+				CD87CD7E29BCC63600C5949D /* README.md */,
 				CD26686D252FF51F00B3E667 /* GoogleMock.xcodeproj */,
 				CD26686A252FF4E100B3E667 /* string_utils */,
 				CDC883E028560A7C0088C91E /* include */,

+ 18 - 18
test/tokenizer_test.cxx

@@ -14,92 +14,92 @@ using namespace string_utils;
 
 TEST(TokenizerTest, SplitsStringOverToken) {
   std::string const input = "A.B.C.D";
-  std::vector<std::string> const expected{"A", "B", "C", "D"};
+  std::vector<std::string_view> const expected{"A", "B", "C", "D"};
   EXPECT_THAT(split(input, "."), expected);
 }
 
 TEST(TokenizerTest, SplitsStringUpToNTimes) {
   std::string const input = "A.B.C.D";
-  std::vector<std::string> const expected{"A", "B", "C.D"};
+  std::vector<std::string_view> const expected{"A", "B", "C.D"};
   EXPECT_THAT(split(input, ".", 3), expected);
 }
 
 TEST(TokenizerTest, IgnoresEmptyElementsAtStart) {
   std::string const input = ".A.B.C";
-  std::vector<std::string> const expected{"A", "B", "C"};
+  std::vector<std::string_view> const expected{"A", "B", "C"};
   EXPECT_THAT(split(input, ".", 3), expected);
 }
 
 TEST(TokenizerTest, IgnoresEmptyElements) {
   std::string const input = "A..B.C";
-  std::vector<std::string> const expected{"A", "B", "C"};
+  std::vector<std::string_view> const expected{"A", "B", "C"};
   EXPECT_THAT(split(input, ".", 3), expected);
 }
 
 TEST(TokenizerTest, IgnoresEmptyElementsOnEnd) {
   std::string const input = "A.B..C";
-  std::vector<std::string> const expected{"A", "B", "C"};
+  std::vector<std::string_view> const expected{"A", "B", "C"};
   EXPECT_THAT(split(input, ".", 3), expected);
 }
 
 TEST(TokenizerTest, TruncateDiscardsOverageInsteadOfNotParsingPast) {
   std::string const input = "A.B.C.D";
-  std::vector<std::string> const expected{"A", "B", "C"};
-  EXPECT_THAT(tokenizer(".").max_outputs(3).truncate(true)(input), expected);
+  std::vector<std::string_view> const expected{"A", "B", "C"};
+  EXPECT_THAT(Tokenizer(".").max_outputs(3).truncate(true)(input), expected);
 }
 
 TEST(TokenizerTest, EmptyIsPlacedCorrectlyWhenEnabled) {
   std::string const input = "A..B.C";
-  std::vector<std::string> const expected{"A", "", "B.C"};
-  EXPECT_THAT(tokenizer(".").max_outputs(3).ignore_empty_tokens(false)(input),
+  std::vector<std::string_view> const expected{"A", "", "B.C"};
+  EXPECT_THAT(Tokenizer(".").max_outputs(3).ignore_empty_tokens(false)(input),
               expected);
 }
 
 TEST(TokenizerTest, MaxSizeWithEmptyCanResultInTokenWithDividerPrefix) {
   std::string const input = "A.B..C";
-  std::vector<std::string> const expected{"A", "B", ".C"};
-  EXPECT_THAT(tokenizer(".").max_outputs(3).ignore_empty_tokens(false)(input),
+  std::vector<std::string_view> const expected{"A", "B", ".C"};
+  EXPECT_THAT(Tokenizer(".").max_outputs(3).ignore_empty_tokens(false)(input),
               expected);
 }
 
 TEST(TokenizerTest, EscapableTokensStickTogether) {
   std::string const input = R"(A B\ C)";
   std::vector<std::string> const expected{"A", "B C"};
-  EXPECT_THAT(tokenizer(" ").escapable(true)(input), expected);
+  EXPECT_THAT(EscapedTokenizer(" ")(input), expected);
 }
 
 TEST(TokenizerTest, CorrectlySplitsWhenEvenEscapes) {
   std::string const input = R"(A B\\ C)";
   std::vector<std::string> const expected{"A", R"(B\\)", "C"};
-  EXPECT_THAT(tokenizer(" ").escapable(true)(input), expected);
+  EXPECT_THAT(EscapedTokenizer(" ")(input), expected);
 }
 
 TEST(TokenizerTest, QuotesAreDiscarded) {
   std::string const input = R"(A,"B",C)";
   std::vector<std::string> const expected{"A", "B", "C"};
-  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
+  EXPECT_THAT(EscapedTokenizer(",", {'"'})(input), expected);
 }
 
 TEST(TokenizerTest, QuotedTokensStickTogether) {
   std::string const input = R"(A,"B,C")";
   std::vector<std::string> const expected{"A", "B,C"};
-  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
+  EXPECT_THAT(EscapedTokenizer(",", {'"'})(input), expected);
 }
 
 TEST(TokenizerTest, QuotedTokensAreEscapable) {
   std::string const input = R"(A,"B\",C")";
   std::vector<std::string> const expected{"A", "B\",C"};
-  EXPECT_THAT(tokenizer(",", {'"', "\\\""})(input), expected);
+  EXPECT_THAT(EscapedTokenizer(",", {'"', "\\\""})(input), expected);
 }
 
 TEST(TokenizerTest, QuoteTokenLiteralIsApplicable) {
   std::string const input = R"(A,"B"",C")";
   std::vector<std::string> const expected{"A", "B\",C"};
-  EXPECT_THAT(tokenizer(",", {'"', "\"\""})(input), expected);
+  EXPECT_THAT(EscapedTokenizer(",", {'"', "\"\""})(input), expected);
 }
 
 TEST(TokenizerTest, QuotesDontNeedToBeAtStartAndEnd) {
   std::string const input = R"(A,B",C")";
   std::vector<std::string> const expected{"A", "B,C"};
-  EXPECT_THAT(tokenizer(",", {'"'})(input), expected);
+  EXPECT_THAT(EscapedTokenizer(",", {'"'})(input), expected);
 }