|
@@ -0,0 +1,103 @@
|
|
|
|
|
+#include <abnf/code_point.h>
|
|
|
|
|
+
|
|
|
|
|
+#include <algorithm>
|
|
|
|
|
+#include <array>
|
|
|
|
|
+#include <cstddef>
|
|
|
|
|
+#include <cstdint>
|
|
|
|
|
+#include <cstring>
|
|
|
|
|
+#include <stdexcept>
|
|
|
|
|
+#include <string>
|
|
|
|
|
+
|
|
|
|
|
namespace {
/**
 * Overlay of a 32-bit code point value as several overlapping bit-field
 * views. Objects of this type are obtained by reinterpreting the storage of a
 * 32-bit integer, so the struct must be exactly 32 bits wide.
 *
 * Logical layout (most significant bit first):
 *   - reserved : bits 31..24, must be zero for any value <= 0x10FFFF
 *   - u..z     : six nibbles covering bits 23..0 (u is the highest)
 *   - b2       : bits 31..7,  non-zero iff the value needs >= 2 UTF-8 bytes
 *   - b3       : bits 31..11, non-zero iff the value needs >= 3 UTF-8 bytes
 *   - b4       : bits 31..16, non-zero iff the value needs 4 UTF-8 bytes
 *
 * The order in which a compiler allocates bit-fields inside a storage unit is
 * implementation-defined. GCC, Clang and MSVC allocate from the least
 * significant bit on little-endian targets and GCC/Clang allocate from the
 * most significant bit on big-endian targets, so the declaration order is
 * selected per byte order to keep the logical layout above on both.
 *
 * NOTE(review): anonymous structs and reading a union member other than the
 * one last written are compiler extensions, not standard C++.
 */
struct bytes {
  /** Zero-based index of the encoding width (width() - 1). */
  constexpr std::size_t ordinal() const { return width() - 1; }

  /** Number of UTF-8 bytes (1..4) needed for the overlaid value. */
  constexpr std::size_t width() const {
    return b4 ? 4 : (b3 ? 3 : (b2 ? 2 : 1));
  }

  union {
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) &&               \
    (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    // Bit-fields are allocated starting at the most significant bit.
    struct {
      std::uint32_t reserved : 8;
      std::uint32_t u : 4;
      std::uint32_t v : 4;
      std::uint32_t w : 4;
      std::uint32_t x : 4;
      std::uint32_t y : 4;
      std::uint32_t z : 4;
    };
    struct {
      std::uint32_t b2 : 25;
      std::uint32_t _2 : 7;
    };
    struct {
      std::uint32_t b3 : 21;
      std::uint32_t _3 : 11;
    };
    struct {
      std::uint32_t b4 : 16;
      std::uint32_t _4 : 16;
    };
#else
    // Bit-fields are allocated starting at the least significant bit, so the
    // members are declared lowest-first to land on the same logical bits.
    struct {
      std::uint32_t z : 4;
      std::uint32_t y : 4;
      std::uint32_t x : 4;
      std::uint32_t w : 4;
      std::uint32_t v : 4;
      std::uint32_t u : 4;
      std::uint32_t reserved : 8;
    };
    struct {
      std::uint32_t _2 : 7;
      std::uint32_t b2 : 25;
    };
    struct {
      std::uint32_t _3 : 11;
      std::uint32_t b3 : 21;
    };
    struct {
      std::uint32_t _4 : 16;
      std::uint32_t b4 : 16;
    };
#endif
  };
};

static_assert(sizeof(bytes) == sizeof(std::uint32_t),
              "bytes must overlay exactly one 32-bit value");

/**
 * One row of the UTF-8 layout table; each row describes sequences of a single
 * width.
 */
struct utf8_bits {
  // Marker bits of every byte of the sequence, lead byte in the most
  // significant byte of the word.
  std::uint32_t filter;
  // Number of low-order bits of a 32-bit word not occupied by the sequence.
  std::uint32_t shift;
  // Payload bit count carried by each of the (up to four) bytes.
  std::uint8_t bits[4];
};

// Largest number of bytes in a single encoded sequence.
constexpr std::size_t MAX_WIDTH = 4UL;
}
|
|
|
|
|
+
|
|
|
|
|
// Both macros read a 4-bit field X of a local object named `data` and place
// it inside a 32-bit word.
//
// SHIFT moves the whole nibble left by BY bits.
#define SHIFT(X, BY) (static_cast<uint32_t>(data.X) << (BY))
// SPLIT_SHIFT cuts the nibble into its upper and lower two bits: the lower
// pair lands at bit BY and the upper pair two bits further left, leaving a
// two-bit gap between them (room for the `10` marker of a UTF-8 continuation
// byte). The masks are 0x0C / 0x03 because X is only four bits wide; the whole
// expansion is parenthesized so it can be combined with other operators.
#define SPLIT_SHIFT(X, BY)                                                     \
  (((static_cast<uint32_t>(data.X) & 0x0Cu) << ((BY) + 2)) |                   \
   ((static_cast<uint32_t>(data.X) & 0x03u) << (BY)))
|
|
|
|
|
+
|
|
|
|
|
+static constexpr std::array<utf8_bits, MAX_WIDTH> g_bits{{
|
|
|
|
|
+ {.filter = 0b00000000'000000000'000000000'00000000,
|
|
|
|
|
+ .shift = 24,
|
|
|
|
|
+ .bits = {7, 0, 0, 0}},
|
|
|
|
|
+ {.filter = 0b110'00000'10'000000'00000000'00000000,
|
|
|
|
|
+ .shift = 16,
|
|
|
|
|
+ .bits = {5, 6, 0, 0}},
|
|
|
|
|
+ {.filter = 0b1110'0000'10'000000'10'000000'00000000,
|
|
|
|
|
+ .shift = 8,
|
|
|
|
|
+ .bits = {4, 6, 6, 0}},
|
|
|
|
|
+ {.filter = 0b11110'000'10'000000'10'000000'10'000000,
|
|
|
|
|
+ .shift = 0,
|
|
|
|
|
+ .bits = {3, 6, 6, 6}},
|
|
|
|
|
+}};
|
|
|
|
|
+
|
|
|
|
|
+namespace abnf {
|
|
|
|
|
+code_point::code_point(std::string_view str) {
|
|
|
|
|
+ char data[MAX_WIDTH] = {'\0'};
|
|
|
|
|
+ std::memcpy(&data, str.data(), std::min(sizeof(data), str.size()));
|
|
|
|
|
+ for (auto const & [filter, shift, bits] : g_bits) {
|
|
|
|
|
+ if ((((*reinterpret_cast<uint32_t *>(data) & filter) >> shift) << shift) !=
|
|
|
|
|
+ filter) {
|
|
|
|
|
+ continue;
|
|
|
|
|
+ }
|
|
|
|
|
+ for (size_t i = MAX_WIDTH, counter = 0; i-- > 0; counter += bits[i]) {
|
|
|
|
|
+ char byte = ((0xFF << bits[i]) ^ 0xFF) & data[i];
|
|
|
|
|
+ value_ |= (byte << counter);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+size_t code_point::width() const {
|
|
|
|
|
+ return reinterpret_cast<bytes const &>(value_).width();
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+code_point::operator std::string() const {
|
|
|
|
|
+ bytes const data = reinterpret_cast<bytes const &>(value_);
|
|
|
|
|
+ if (data.reserved) {
|
|
|
|
|
+ throw std::domain_error("Illegal Codepoint (>0x10FFFF)");
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ uint32_t bdata = (SHIFT(u, 26) | SPLIT_SHIFT(v, 20) | SHIFT(w, 16) |
|
|
|
|
|
+ SHIFT(x, 10) | SHIFT(z, 0));
|
|
|
|
|
+ bdata |= data.b2 ? SPLIT_SHIFT(y, 4) : SHIFT(y, 4);
|
|
|
|
|
+
|
|
|
|
|
+ auto [filter, shift, _] = g_bits.at(data.ordinal());
|
|
|
|
|
+ bdata = (bdata | filter) << shift;
|
|
|
|
|
+ return {reinterpret_cast<char *>(&bdata), data.width()};
|
|
|
|
|
+}
|
|
|
|
|
+}
|