tokenizer.cxx 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. //
  2. // tokenizer.cxx
  3. // string-utils
  4. //
  5. // Created by Sam Jaffe on 10/8/20.
  6. // Copyright © 2020 Sam Jaffe. All rights reserved.
  7. //
  8. #include "string_utils/tokenizer.h"
  9. namespace {
  // Reports whether the character at position `p` of `str` is escaped, meaning
  // it is immediately preceded by an odd number of consecutive backslashes.
  // An even-length run is treated as escaped backslashes that leave the
  // character at `p` untouched. `p` must not exceed `str.size()`.
  bool is_escaped(std::string_view str, std::size_t p) {
    // Walk backwards from the character just before `p`, counting the
    // unbroken run of backslashes that ends there.
    std::size_t backslashes = 0;
    while (backslashes < p and str[p - 1 - backslashes] == '\\') {
      ++backslashes;
    }
    return backslashes % 2 == 1;
  }
  // Converts a list of non-owning views into a list of owning strings, keeping
  // the element order. Each output string is a copy of the viewed characters.
  auto promote(std::vector<std::string_view> input) {
    std::vector<std::string> owned;
    owned.reserve(input.size());
    for (std::string_view const piece : input) {
      owned.emplace_back(piece);
    }
    return owned;
  }
  // Returns true when `find` is non-empty and the characters of `token`
  // starting at `offset` begin with `find`.
  //
  // An empty `find` never matches. An `offset` past the end of `token` also
  // yields false: string_view::compare would throw std::out_of_range for such
  // an offset, and callers here only want a yes/no answer.
  bool current_token_is(std::string_view token, std::size_t offset,
                        std::string_view find) {
    if (find.empty() or offset > token.size()) { return false; }
    return token.compare(offset, find.size(), find) == 0;
  }
  21. }
  22. namespace string_utils {
  23. Tokenizer::Tokenizer(std::string_view divider) : divider_(divider) {}
  24. Tokenizer & Tokenizer::max_outputs(size_t new_max_outputs) {
  25. max_outputs_ = new_max_outputs;
  26. return *this;
  27. }
  28. Tokenizer & Tokenizer::truncate(bool new_truncate) {
  29. truncate_ = new_truncate;
  30. return *this;
  31. }
  32. Tokenizer & Tokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
  33. ignore_empty_tokens_ = new_ignore_empty_tokens;
  34. return *this;
  35. }
  36. Tokenizer & Tokenizer::reverse_search(bool new_reverse) {
  37. reverse_search_ = new_reverse;
  38. return *this;
  39. }
  40. EscapedTokenizer::EscapedTokenizer(std::string_view divider, Quote quote)
  41. : EscapedTokenizer(Tokenizer(divider), quote) {}
  42. EscapedTokenizer::EscapedTokenizer(Tokenizer tok, Quote quote)
  43. : Tokenizer(tok) {
  44. this->quote(quote);
  45. }
  46. EscapedTokenizer & EscapedTokenizer::max_outputs(size_t new_max_outputs) {
  47. Tokenizer::max_outputs(new_max_outputs);
  48. return *this;
  49. }
  50. EscapedTokenizer & EscapedTokenizer::truncate(bool new_truncate) {
  51. Tokenizer::truncate(new_truncate);
  52. return *this;
  53. }
  54. EscapedTokenizer &
  55. EscapedTokenizer::ignore_empty_tokens(bool new_ignore_empty_tokens) {
  56. Tokenizer::ignore_empty_tokens(new_ignore_empty_tokens);
  57. return *this;
  58. }
  59. EscapedTokenizer & EscapedTokenizer::reverse_search(bool new_reverse) {
  60. Tokenizer::reverse_search(new_reverse);
  61. return *this;
  62. }
  63. Tokenizer & Tokenizer::quote(Quote quote) {
  64. quote_ = quote;
  65. escapable_ = true;
  66. return *this;
  67. }
  68. EscapedTokenizer Tokenizer::escapable(Quote quote) const {
  69. return EscapedTokenizer(*this, quote);
  70. }
  71. std::vector<std::string_view>
  72. Tokenizer::operator()(std::string_view input) const {
  73. std::vector<std::string_view> rval;
  74. // If max_outputs_ == infinite_outputs, this will be infinite enough to work
  75. // since we'll hit overflow on the string itself before this.
  76. std::size_t const max = max_outputs_ - !truncate_;
  77. std::size_t const qsz = quote_.escaped.size();
  78. size_t span = 0;
  79. auto index = [this, &input, &span]() {
  80. return reverse_search_ ? input.size() - span - 1 : span;
  81. };
  82. bool in_quote{false};
  83. while (not input.empty() and rval.size() < max and span == 0) {
  84. for (span = 0; span < input.size(); ++span) {
  85. // We check for escaped-quotes before we check for quotes to minimise
  86. // complexity. Once in a quote, we simply append everything without
  87. // checking for the divider until the end quote is encountered (escaped
  88. // quotes are processed normally).
  89. if (not quote_.escaped.empty() and
  90. current_token_is(input, index(), quote_.escaped)) {
  91. span += qsz - 1;
  92. } else if (input[index()] == quote_.on) {
  93. in_quote = !in_quote;
  94. } else if (in_quote or not current_token_is(input, index(), divider_)) {
  95. continue;
  96. } else if (escapable_ and is_escaped(input, index())) {
  97. continue;
  98. } else if (reverse_search_) {
  99. if (not ignore_empty_tokens_ or span > 0) {
  100. rval.push_back(input.substr(index() + 1, span));
  101. }
  102. input.remove_suffix(span + divider_.size());
  103. span = 0;
  104. break;
  105. } else {
  106. if (not ignore_empty_tokens_ or span > 0) {
  107. rval.push_back(input.substr(0, span));
  108. }
  109. input.remove_prefix(span + divider_.size());
  110. span = 0;
  111. break;
  112. }
  113. }
  114. }
  115. // Due to the special handling rules of the truncate feature, we need
  116. // to add an additional layer of handling around empty tokens and buffer
  117. if (ignore_empty_tokens_ and current_token_is(input, span, divider_)) {
  118. input.remove_prefix(divider_.size());
  119. }
  120. // Additionally, we do not want to include the final element if there is
  121. // actually no data remaining in the buffer/input string, even when we permit
  122. // empty tokens in our output.
  123. if (rval.size() < max_outputs_ and not input.empty()) {
  124. rval.push_back(input);
  125. }
  126. if (reverse_search_) { std::reverse(rval.begin(), rval.end()); }
  127. return rval;
  128. }
  129. std::string Tokenizer::unescape(std::string_view token) const {
  130. std::string rval;
  131. rval.reserve(token.size());
  132. for (size_t i = 0; i < token.size(); ++i) {
  133. /*if (current_token_is(token, i, "\\\\")) {
  134. rval.append(1, '\\');
  135. ++i;
  136. } else */
  137. // The order of these tests is important!
  138. // First we check if the current token is an escaped-quote - if so,
  139. // replace it with the regular quote.
  140. if (current_token_is(token, i, quote_.escaped)) {
  141. rval.append(1, quote_.on);
  142. } else if (token[i] == '\\' and current_token_is(token, i + 1, divider_)) {
  143. // Then we check if we're looking at an escaped divider
  144. rval.append(divider_);
  145. i += divider_.size();
  146. } else if (token[i] != quote_.on) {
  147. // Lastly - we remote unescaped quotes
  148. rval.append(1, token[i]);
  149. }
  150. }
  151. return rval;
  152. }
  153. std::vector<std::string> Tokenizer::operator()(std::string && input) const {
  154. return promote(operator()(std::string_view(input)));
  155. }
  156. std::vector<std::string>
  157. EscapedTokenizer::operator()(std::string_view input) const {
  158. auto rval = promote(Tokenizer::operator()(std::string_view(input)));
  159. for (auto & token : rval) {
  160. token = unescape(token);
  161. }
  162. return rval;
  163. }
  164. std::vector<std::string>
  165. EscapedTokenizer::operator()(std::string && input) const {
  166. return operator()(std::string_view(input));
  167. }
  168. }