tokenizer.cxx 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. //
  2. // tokenizer.cxx
  3. // string-utils
  4. //
  5. // Created by Sam Jaffe on 10/8/20.
  6. // Copyright © 2020 Sam Jaffe. All rights reserved.
  7. //
  8. #include "string_utils/tokenizer.h"
  9. namespace string_utils {
  10. tokenizer::tokenizer(std::string divider, struct quote quote)
  11. : divider_(std::move(divider)), quote_(std::move(quote)) {}
  12. tokenizer & tokenizer::max_outputs(size_t new_max_outputs) {
  13. max_outputs_ = new_max_outputs;
  14. return *this;
  15. }
  16. tokenizer & tokenizer::truncate(bool new_truncate) {
  17. truncate_ = new_truncate;
  18. return *this;
  19. }
  20. tokenizer & tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
  21. ignore_empty_tokens_ = new_ignore_empty_tokens;
  22. return *this;
  23. }
  24. tokenizer & tokenizer::escapable(bool new_escapable) {
  25. escapable_ = new_escapable;
  26. return *this;
  27. }
  /**
   * Count how many copies of `c` sit directly in front of index `p`.
   *
   * The scan walks backwards from `str[p - 1]` and stops at the first
   * character that is not `c`, or at the start of the string.
   *
   * @param str the string to inspect
   * @param p   the index whose preceding characters are examined
   * @param c   the character to count
   * @return the length of the unbroken run of `c` ending at `p - 1`
   *         (0 when `p` is 0 or `str[p - 1]` is not `c`)
   */
  static std::size_t countback(std::string const & str, std::size_t p, char c) {
    std::size_t run = 0;
    while (run < p && str[p - 1 - run] == c) { ++run; }
    return run;
  }
  32. std::vector<std::string>
  33. tokenizer::operator()(std::string const & input) const {
  34. auto equals_from = [&input](std::string const & token, std::size_t from) {
  35. return token.size() + from < input.size() &&
  36. std::strncmp(input.c_str() + from, token.c_str(), token.size()) == 0;
  37. };
  38. std::vector<std::string> rval;
  39. std::string buffer;
  40. buffer.reserve(input.size());
  41. // If max_outputs_ == infinite_outputs, this will be infinite enough to work
  42. // since we'll hit overflow on the string itself before this.
  43. std::size_t const max = max_outputs_ - !truncate_;
  44. std::size_t const qsz = quote_.escaped.size();
  45. std::size_t from = 0;
  46. bool in_quote{false};
  47. for (std::size_t pos = 0; pos < input.size() && rval.size() < max; ++pos) {
  48. // We check for escaped-quotes before we check for quotes to minimise
  49. // complexity. Once in a quote, we simply append everything without checking
  50. // for the divider until the end quote is encountered (escaped quotes are
  51. // processed normally).
  52. if (qsz > 0 && equals_from(quote_.escaped, pos)) {
  53. buffer.append(1, quote_.on);
  54. pos += qsz - 1;
  55. } else if (input[pos] == quote_.on) {
  56. in_quote = !in_quote;
  57. } else if (in_quote || !equals_from(divider_, pos)) {
  58. buffer.append(1, input[pos]);
  59. } else if (escapable_ && countback(input, pos, '\\') % 2) {
  60. buffer.back() = input[pos];
  61. } else if (!in_quote) {
  62. if (!ignore_empty_tokens_ || buffer.size()) { rval.emplace_back(buffer); }
  63. from = pos + 1;
  64. buffer.clear();
  65. }
  66. }
  67. // Due to the special handling rules of the truncate feature, we need
  68. // to add an additional layer of handling around empty tokens and buffer
  69. if (ignore_empty_tokens_ && equals_from(divider_, from)) { ++from; }
  70. // Additionally, we do not want to include the final element if there is
  71. // actually no data remaining in the buffer/input string, even when we permit
  72. // empty tokens in our output.
  73. if (rval.size() < max_outputs_ && !(buffer.empty() && from == input.size())) {
  74. rval.emplace_back(buffer.empty() ? input.substr(from) : buffer);
  75. }
  76. return rval;
  77. }
  78. }