tokenizer.cxx 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. //
  2. // tokenizer.cxx
  3. // string-utils
  4. //
  5. // Created by Sam Jaffe on 10/8/20.
  6. // Copyright © 2020 Sam Jaffe. All rights reserved.
  7. //
  8. #include "string_utils/tokenizer.h"
  9. namespace string_utils {
  // Reverses the element order of any container exposing begin()/end(), in
  // place. Used in this file on both std::string and std::vector<std::string>.
  template <typename C> static void reverse(C & str) {
    using std::begin;
    using std::end;
    std::reverse(begin(str), end(str));
  }
  13. tokenizer::tokenizer(std::string divider, struct quote quote)
  14. : divider_(std::move(divider)), quote_(std::move(quote)) {}
  15. tokenizer & tokenizer::max_outputs(size_t new_max_outputs) {
  16. max_outputs_ = new_max_outputs;
  17. return *this;
  18. }
  19. tokenizer & tokenizer::truncate(bool new_truncate) {
  20. truncate_ = new_truncate;
  21. return *this;
  22. }
  23. tokenizer & tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
  24. ignore_empty_tokens_ = new_ignore_empty_tokens;
  25. return *this;
  26. }
  27. tokenizer & tokenizer::escapable(bool new_escapable) {
  28. escapable_ = new_escapable;
  29. return *this;
  30. }
  31. tokenizer & tokenizer::reverse_search(bool new_reverse) {
  32. if (reverse_search_ != new_reverse) {
  33. reverse(divider_);
  34. reverse(quote_.escaped);
  35. }
  36. reverse_search_ = new_reverse;
  37. return *this;
  38. }
  // Returns the length of the unbroken run of `c` that ends immediately before
  // index `p` in `str` (0 when p is 0 or str[p - 1] is not `c`).
  static std::size_t countback(std::string const & str, std::size_t p, char c) {
    std::size_t run = 0;
    // Walk backwards from p - 1 until the run is broken or the start is hit.
    while (run < p && str[p - run - 1] == c) {
      ++run;
    }
    return run;
  }
  43. std::vector<std::string> tokenizer::operator()(std::string input) const {
  44. auto equals_from = [&input](std::string const & token, std::size_t from) {
  45. return token.size() + from < input.size() &&
  46. std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0;
  47. };
  48. if (reverse_search_) { reverse(input); }
  49. std::vector<std::string> rval;
  50. std::string buffer;
  51. buffer.reserve(input.size());
  52. // If max_outputs_ == infinite_outputs, this will be infinite enough to work
  53. // since we'll hit overflow on the string itself before this.
  54. std::size_t const max = max_outputs_ - !truncate_;
  55. std::size_t const qsz = quote_.escaped.size();
  56. std::size_t from = 0;
  57. bool in_quote{false};
  58. for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
  59. // We check for escaped-quotes before we check for quotes to minimise
  60. // complexity. Once in a quote, we simply append everything without checking
  61. // for the divider until the end quote is encountered (escaped quotes are
  62. // processed normally).
  63. if (qsz > 0 && equals_from(quote_.escaped, pos)) {
  64. buffer.append(1, quote_.on);
  65. pos += qsz - 1;
  66. } else if (input[pos] == quote_.on) {
  67. in_quote = !in_quote;
  68. } else if (in_quote || !equals_from(divider_, pos)) {
  69. buffer.append(1, input[pos]);
  70. } else if (escapable_ && countback(input, pos, '\\') % 2) {
  71. buffer.back() = input[pos];
  72. } else if (!in_quote) {
  73. if (!ignore_empty_tokens_ || buffer.size()) { rval.emplace_back(buffer); }
  74. from = pos + 1;
  75. buffer.clear();
  76. }
  77. }
  78. // Due to the special handling rules of the truncate feature, we need
  79. // to add an additional layer of handling around empty tokens and buffer
  80. if (ignore_empty_tokens_ && equals_from(divider_, from)) { ++from; }
  81. // Additionally, we do not want to include the final element if there is
  82. // actually no data remaining in the buffer/input string, even when we permit
  83. // empty tokens in our output.
  84. if (rval.size() < max_outputs_ && !(buffer.empty() && from == input.size())) {
  85. rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
  86. }
  87. if (reverse_search_) {
  88. reverse(rval);
  89. for (auto & str : rval) {
  90. reverse(str);
  91. }
  92. }
  93. return rval;
  94. }
  95. }