tokenizer_test.cxx 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  //
  // tokenizer_test.cxx
  // string-utils
  //
  // Created by Sam Jaffe on 10/8/20.
  // Copyright © 2020 Sam Jaffe. All rights reserved.
  //
  8. #include "string_utils/tokenizer.h"
  9. #include "xcode_gtest_helper.h"
  10. using namespace string_utils;
  11. TEST(TokenizerTest, SplitsStringOverToken) {
  12. std::string const input = "A.B.C.D";
  13. std::vector<std::string_view> const expected{"A", "B", "C", "D"};
  14. EXPECT_THAT(split(input, "."), expected);
  15. }
  16. TEST(TokenizerTest, SplitsStringUpToNTimes) {
  17. std::string const input = "A.B.C.D";
  18. std::vector<std::string_view> const expected{"A", "B", "C.D"};
  19. EXPECT_THAT(split(input, ".", 3), expected);
  20. }
  21. TEST(TokenizerTest, IgnoresEmptyElementsAtStart) {
  22. std::string const input = ".A.B.C";
  23. std::vector<std::string_view> const expected{"A", "B", "C"};
  24. EXPECT_THAT(split(input, ".", 3), expected);
  25. }
  26. TEST(TokenizerTest, IgnoresEmptyElements) {
  27. std::string const input = "A..B.C";
  28. std::vector<std::string_view> const expected{"A", "B", "C"};
  29. EXPECT_THAT(split(input, ".", 3), expected);
  30. }
  31. TEST(TokenizerTest, IgnoresEmptyElementsOnEnd) {
  32. std::string const input = "A.B..C";
  33. std::vector<std::string_view> const expected{"A", "B", "C"};
  34. EXPECT_THAT(split(input, ".", 3), expected);
  35. }
  36. TEST(TokenizerTest, TruncateDiscardsOverageInsteadOfNotParsingPast) {
  37. std::string const input = "A.B.C.D";
  38. std::vector<std::string_view> const expected{"A", "B", "C"};
  39. EXPECT_THAT(Tokenizer(".").max_outputs(3).truncate(true)(input), expected);
  40. }
  41. TEST(TokenizerTest, RTruncateDiscardsOverageInsteadOfNotParsingPast) {
  42. std::string const input = "A.B.C.D";
  43. std::vector<std::string_view> const expected{"B", "C", "D"};
  44. EXPECT_THAT(
  45. Tokenizer(".").max_outputs(3).truncate(true).reverse_search(true)(input),
  46. expected);
  47. }
  48. TEST(TokenizerTest, EmptyIsPlacedCorrectlyWhenEnabled) {
  49. std::string const input = "A..B.C";
  50. std::vector<std::string_view> const expected{"A", "", "B.C"};
  51. EXPECT_THAT(Tokenizer(".").max_outputs(3).ignore_empty_tokens(false)(input),
  52. expected);
  53. }
  54. TEST(TokenizerTest, MaxSizeWithEmptyCanResultInTokenWithDividerPrefix) {
  55. std::string const input = "A.B..C";
  56. std::vector<std::string_view> const expected{"A", "B", ".C"};
  57. EXPECT_THAT(Tokenizer(".").max_outputs(3).ignore_empty_tokens(false)(input),
  58. expected);
  59. }
  60. TEST(TokenizerTest, EscapableTokensStickTogether) {
  61. std::string const input = R"(A B\ C)";
  62. std::vector<std::string> const expected{"A", "B C"};
  63. EXPECT_THAT(EscapedTokenizer(" ")(input), expected);
  64. }
  65. TEST(TokenizerTest, CorrectlySplitsWhenEvenEscapes) {
  66. std::string const input = R"(A B\\ C)";
  67. std::vector<std::string> const expected{"A", R"(B\\)", "C"};
  68. EXPECT_THAT(EscapedTokenizer(" ")(input), expected);
  69. }
  70. TEST(TokenizerTest, QuotesAreDiscarded) {
  71. std::string const input = R"(A,"B",C)";
  72. std::vector<std::string> const expected{"A", "B", "C"};
  73. EXPECT_THAT(EscapedTokenizer(",", {'"'})(input), expected);
  74. }
  75. TEST(TokenizerTest, QuotedTokensStickTogether) {
  76. std::string const input = R"(A,"B,C")";
  77. std::vector<std::string> const expected{"A", "B,C"};
  78. EXPECT_THAT(EscapedTokenizer(",", {'"'})(input), expected);
  79. }
  80. TEST(TokenizerTest, QuotedTokensAreEscapable) {
  81. std::string const input = R"(A,"B\",C")";
  82. std::vector<std::string> const expected{"A", "B\",C"};
  83. EXPECT_THAT(EscapedTokenizer(",", {'"', "\\\""})(input), expected);
  84. }
  85. TEST(TokenizerTest, QuoteTokenLiteralIsApplicable) {
  86. std::string const input = R"(A,"B"",C")";
  87. std::vector<std::string> const expected{"A", "B\",C"};
  88. EXPECT_THAT(EscapedTokenizer(",", {'"', "\"\""})(input), expected);
  89. }
  90. TEST(TokenizerTest, QuotesDontNeedToBeAtStartAndEnd) {
  91. std::string const input = R"(A,B",C")";
  92. std::vector<std::string> const expected{"A", "B,C"};
  93. EXPECT_THAT(EscapedTokenizer(",", {'"'})(input), expected);
  94. }