tokenizer.h 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
//
// tokenizer.h
// string-utils
//
// Created by Sam Jaffe on 10/8/20.
// Copyright © 2020 Sam Jaffe. All rights reserved.
//
  8. #pragma once
  9. #include <string>
  10. #include <vector>
  11. #include "string_utils/forwards.h"
  12. namespace string_utils {
  // Declared here so that escapable() can name its return type; the class
  // itself is defined further down in this header.
  class EscapedTokenizer;

  // Configurable string splitter.
  //
  // A Tokenizer is built from a divider and then tuned through the chainable
  // setters below (each returns Tokenizer &). Invoking the object on an input
  // yields the tokens.
  //
  // NOTE(review): the divider and the Quote::escaped text are held as
  // std::string_view members, i.e. this class does not own those characters.
  // Presumably the caller must keep them alive for as long as the Tokenizer
  // is used -- confirm against the implementation file.
  class Tokenizer {
  public:
    // Sentinel for "no limit on the number of outputs": the largest size_t.
    //
    // Written as ~size_t{0} rather than ~0ul: `0ul` is an unsigned long,
    // which is only 32 bits wide on LLP64 platforms (e.g. 64-bit Windows),
    // so the old spelling would produce 0xFFFFFFFF there instead of the
    // maximum size_t value.
    static constexpr size_t UNLIMITED = ~size_t{0};

  protected:
    // Quoting configuration handed to escapable() / quote().
    struct Quote {
      // Quote character; the defaults in this header use '\0'
      // (presumably meaning "no quoting" -- TODO confirm).
      char on;
      // Text associated with an escaped quote (presumably the sequence that
      // stands for a literal quote character inside a token -- TODO confirm).
      std::string_view escaped;
    };

  private:
    // Non-owning view of the divider the tokenizer was constructed with.
    std::string_view divider_;
    // Active quoting configuration; defaults to {'\0', ""}.
    Quote quote_{'\0', ""};
    // Output limit; defaults to UNLIMITED.
    size_t max_outputs_{UNLIMITED};
    // Set through truncate(); exact effect lives in the implementation.
    bool truncate_{false};
    // Set through ignore_empty_tokens(); on by default.
    bool ignore_empty_tokens_{true};
    // Off by default (presumably switched on for EscapedTokenizer).
    bool escapable_{false};
    // Set through reverse_search(); off by default.
    bool reverse_search_{false};

  public:
    // Creates a tokenizer for the given divider. Explicit, so a bare string
    // never converts to a Tokenizer implicitly.
    explicit Tokenizer(std::string_view divider);

    // Chainable configuration setters; each returns a Tokenizer reference.
    Tokenizer & max_outputs(size_t new_max_outputs);
    Tokenizer & truncate(bool new_truncate_overage);
    Tokenizer & ignore_empty_tokens(bool new_ignore_empty_tokens);
    Tokenizer & reverse_search(bool new_reverse);

    // Produces an EscapedTokenizer using the given quote configuration
    // (default: {'\0', ""}). Const: this tokenizer is not modified, which is
    // why discarding the result is flagged via [[nodiscard]].
    [[nodiscard]] EscapedTokenizer escapable(Quote quote = Quote{'\0', ""}) const;

    // Tokenizes an rvalue string; the result owns its tokens (std::string).
    std::vector<std::string> operator()(std::string && input) const;
    // Tokenizes a borrowed string; the result holds std::string_view tokens.
    // NOTE(review): these presumably point into `input`, so `input` must
    // outlive the returned vector -- confirm against the implementation.
    std::vector<std::string_view> operator()(std::string_view input) const;

  protected:
    // Sets the quote configuration; returns a Tokenizer reference.
    Tokenizer & quote(Quote quote);
    // Returns an owning, unescaped copy of `token` (details are in the
    // implementation file).
    std::string unescape(std::string_view token) const;
  };
  42. class EscapedTokenizer : public Tokenizer {
  43. public:
  44. explicit EscapedTokenizer(std::string_view divider,
  45. Quote quote = Quote{'\0', ""});
  46. explicit EscapedTokenizer(Tokenizer impl, Quote quote);
  47. EscapedTokenizer & max_outputs(size_t new_max_outputs);
  48. EscapedTokenizer & truncate(bool new_truncate_overage);
  49. EscapedTokenizer & ignore_empty_tokens(bool new_ignore_empty_tokens);
  50. EscapedTokenizer & reverse_search(bool new_reverse);
  51. std::vector<std::string> operator()(std::string && input) const;
  52. std::vector<std::string> operator()(std::string_view input) const;
  53. };
  54. inline auto split(std::string const & data, std::string const & on,
  55. size_t max = Tokenizer::UNLIMITED) {
  56. return Tokenizer{on}.max_outputs(max)(data);
  57. }
  58. inline auto rsplit(std::string const & data, std::string const & on,
  59. size_t max = Tokenizer::UNLIMITED) {
  60. return Tokenizer{on}.reverse_search(true).max_outputs(max)(data);
  61. }
  62. }