| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
- //
- // tokenizer.hpp
- // string-utils
- //
- // Created by Sam Jaffe on 10/8/20.
- // Copyright © 2020 Sam Jaffe. All rights reserved.
- //
- #pragma once
#include <cstddef>
#include <string>
#include <string_view>
#include <vector>

#include "string_utils/forwards.h"
- namespace string_utils {
- class Tokenizer {
- public:
- static constexpr size_t const UNLIMITED = ~0ul;
- protected:
- struct Quote {
- char on;
- std::string_view escaped;
- };
- private:
- std::string_view divider_;
- Quote quote_{'\0', ""};
- size_t max_outputs_{UNLIMITED};
- bool truncate_{false};
- bool ignore_empty_tokens_{true};
- bool escapable_{false};
- bool reverse_search_{false};
- public:
- explicit Tokenizer(std::string_view divider);
- Tokenizer & max_outputs(size_t new_max_outputs);
- Tokenizer & truncate(bool new_truncate_overage);
- Tokenizer & ignore_empty_tokens(bool new_ignore_empty_tokens);
- Tokenizer & reverse_search(bool new_reverse);
- [[nodiscard]] EscapedTokenizer escapable(Quote quote = Quote{'\0', ""}) const;
- std::vector<std::string> operator()(std::string && input) const;
- std::vector<std::string_view> operator()(std::string_view input) const;
- protected:
- Tokenizer & quote(Quote quote);
- std::string unescape(std::string_view token) const;
- };
- class EscapedTokenizer : public Tokenizer {
- public:
- explicit EscapedTokenizer(std::string_view divider,
- Quote quote = Quote{'\0', ""});
- explicit EscapedTokenizer(Tokenizer impl, Quote quote);
- EscapedTokenizer & max_outputs(size_t new_max_outputs);
- EscapedTokenizer & truncate(bool new_truncate_overage);
- EscapedTokenizer & ignore_empty_tokens(bool new_ignore_empty_tokens);
- EscapedTokenizer & reverse_search(bool new_reverse);
- std::vector<std::string> operator()(std::string && input) const;
- std::vector<std::string> operator()(std::string_view input) const;
- };
- inline auto split(std::string const & data, std::string const & on,
- size_t max = Tokenizer::UNLIMITED) {
- return Tokenizer{on}.max_outputs(max)(data);
- }
- inline auto rsplit(std::string const & data, std::string const & on,
- size_t max = Tokenizer::UNLIMITED) {
- return Tokenizer{on}.reverse_search(true).max_outputs(max)(data);
- }
- }
|