tokenizer.cxx 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. //
  2. // tokenizer.cxx
  3. // string-utils
  4. //
  5. // Created by Sam Jaffe on 10/8/20.
  6. // Copyright © 2020 Sam Jaffe. All rights reserved.
  7. //
  8. #include "string_utils/tokenizer.h"
  9. namespace string_utils {
  10. tokenizer::tokenizer(std::string divider, struct quote quote)
  11. : divider_(std::move(divider)), quote_(std::move(quote)) {}
  12. tokenizer &tokenizer::max_outputs(size_t new_max_outputs) {
  13. max_outputs_ = new_max_outputs;
  14. return *this;
  15. }
  16. tokenizer &tokenizer::truncate(bool new_truncate) {
  17. truncate_ = new_truncate;
  18. return *this;
  19. }
  20. tokenizer &tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
  21. ignore_empty_tokens_ = new_ignore_empty_tokens;
  22. return *this;
  23. }
  24. tokenizer &tokenizer::escapable(bool new_escapable) {
  25. escapable_ = new_escapable;
  26. return *this;
  27. }
  // Count how many consecutive copies of `c` sit immediately before index `p`
  // in `str` (i.e. at p - 1, p - 2, ...). Returns 0 when `p` is 0 or the
  // character just before `p` is not `c`.
  static std::size_t countback(std::string const &str, std::size_t p, char c) {
    std::size_t run = 0;
    // Walk backwards from p - 1 for as long as the characters keep matching.
    while (run < p && str[p - 1 - run] == c) {
      ++run;
    }
    return run;
  }
  32. std::size_t tokenizer::find(std::string &input, std::size_t from) const {
  33. auto pos = input.find(divider_, from);
  34. while (escapable_ && pos != std::string::npos &&
  35. countback(input, pos, '\\') % 2) {
  36. input.erase(pos - 1, 1);
  37. pos = input.find(divider_, pos);
  38. }
  39. return pos;
  40. }
  41. std::vector<std::string> tokenizer::operator()(std::string input) const {
  42. std::vector<std::string> rval;
  43. // If max_outputs_ == infinite_outputs, this will be infinite enough to work
  44. // since we'll hit overflow on the string itself before this.
  45. size_t const max = max_outputs_ - !truncate_;
  46. size_t i = 0;
  47. for (size_t n = find(input, i);
  48. n != std::string::npos && rval.size() < max;
  49. i = n + 1, n = find(input, i)) {
  50. if (ignore_empty_tokens_ && i == n) {
  51. continue;
  52. }
  53. rval.emplace_back(input.substr(i, n - i));
  54. }
  55. // Special Handling for the final token
  56. if (ignore_empty_tokens_ && input.find(divider_, i) == i) {
  57. ++i;
  58. }
  59. if (rval.size() < max_outputs_) {
  60. rval.emplace_back(input.substr(i));
  61. }
  62. return rval;
  63. }
  64. }